diff --git a/.clang-format-ignore b/.clang-format-ignore
index 4a9d3656fd9..dd0b7408c8b 100644
--- a/.clang-format-ignore
+++ b/.clang-format-ignore
@@ -67,6 +67,7 @@ tests/tt_metal/test_utils/env_vars.hpp
tests/tt_metal/tt_metal/api/allocator/test_free_list_opt_allocator.cpp
tests/tt_metal/tt_metal/api/test_global_semaphores.cpp
tests/tt_metal/tt_metal/dispatch/sub_device_test_utils.hpp
+tests/tt_metal/tt_metal/perf_microbenchmark/dispatch/pgm_dispatch_golden.json
tests/tt_metal/tt_metal/perf_microbenchmark/10_dram_read_remote_cb_sync/test_dram_read_remote_cb.cpp
tests/tt_metal/tt_metal/perf_microbenchmark/11_remote_cb_sync_matmul_single_core/test_remote_cb_sync_matmul.cpp
tests/tt_metal/tt_metal/perf_microbenchmark/routing/kernels/traffic_gen_rx.cpp
diff --git a/.github/actions/prepare-metal-run/action.yml b/.github/actions/prepare-metal-run/action.yml
index 874c0223d37..a6784eddc4f 100644
--- a/.github/actions/prepare-metal-run/action.yml
+++ b/.github/actions/prepare-metal-run/action.yml
@@ -2,9 +2,6 @@ name: Prepare Metal Run
description: "Installs Python Dependencies from cache or from PyPI if cache is not available."
inputs:
- arch:
- description: "The architecture to use"
- required: true
is_profiler:
description: "Whether to load with profiler"
required: false
@@ -24,14 +21,14 @@ runs:
- uses: actions/download-artifact@v4
if: ${{ inputs.is_profiler == 'false' }}
with:
- name: TTMetal_build_${{ inputs.arch }}
+ name: TTMetal_build_any
- uses: actions/download-artifact@v4
if: ${{ inputs.is_profiler == 'true' }}
with:
- name: TTMetal_build_${{ inputs.arch }}_profiler
+ name: TTMetal_build_any_profiler
- name: Extract files
shell: bash
- run: tar -xvf ttm_${{ inputs.arch }}.tar
+ run: tar -xvf ttm_any.tar
- uses: ./.github/actions/install-python-deps
with:
python-version: ${{ inputs.python-version }}
diff --git a/.github/workflows/_build-wheels-impl.yaml b/.github/workflows/_build-wheels-impl.yaml
index 239729947f0..70e211af017 100644
--- a/.github/workflows/_build-wheels-impl.yaml
+++ b/.github/workflows/_build-wheels-impl.yaml
@@ -6,9 +6,6 @@ on:
os:
required: True
type: string
- arch:
- required: True
- type: string
from-precompiled:
required: True
default: True
@@ -17,8 +14,6 @@ on:
jobs:
build-wheel:
runs-on: ${{ inputs.os }}
- env:
- ARCH_NAME: ${{ inputs.arch }}
steps:
- uses: tenstorrent/tt-metal/.github/actions/checkout-with-submodule-lfs@main
with:
@@ -57,7 +52,6 @@ jobs:
- uses: ./.github/actions/prepare-metal-run
if: ${{ inputs.from-precompiled }}
with:
- arch: ${{ inputs.arch }}
python-version: ${{ env.python-version }}
- name: Set precompiled dir for precompile builds
if: ${{ inputs.from-precompiled }}
@@ -69,5 +63,5 @@ jobs:
- name: Upload distribution as artifact
uses: actions/upload-artifact@v4
with:
- name: eager-dist-${{ inputs.os }}-${{ inputs.arch }}
+ name: eager-dist-${{ inputs.os }}-any
path: dist/
diff --git a/.github/workflows/_test-wheels-impl.yaml b/.github/workflows/_test-wheels-impl.yaml
index 6049068510c..b61afa66161 100644
--- a/.github/workflows/_test-wheels-impl.yaml
+++ b/.github/workflows/_test-wheels-impl.yaml
@@ -37,7 +37,7 @@ jobs:
os: ${{ matrix.os }}
- uses: actions/download-artifact@v4
with:
- name: eager-dist-${{ matrix.os }}-${{ matrix.runner-hw-info.arch }}
+ name: eager-dist-${{ matrix.os }}-any
- name: Set up end-to-end tests environment
run: ./tests/scripts/set_up_end_to_end_tests_env.sh
- name: Activate env and run release tests - host
@@ -61,7 +61,7 @@ jobs:
- uses: actions/checkout@v4
- uses: actions/download-artifact@v4
with:
- name: eager-dist-${{ matrix.os }}-${{ matrix.runner-hw-info.arch }}
+ name: eager-dist-${{ matrix.os }}-any
- name: Set up end-to-end tests environment
run: ./tests/scripts/set_up_end_to_end_tests_env.sh
- name: Activate env and run release tests - silicon
diff --git a/.github/workflows/all-post-commit-workflows.yaml b/.github/workflows/all-post-commit-workflows.yaml
index 09072086616..f42e4d21e9b 100644
--- a/.github/workflows/all-post-commit-workflows.yaml
+++ b/.github/workflows/all-post-commit-workflows.yaml
@@ -40,11 +40,9 @@ jobs:
# Since pre-compiled builds only run on 20.04, we can only test on 20.04 for now
# The full 22.04 flow can be tested without precompiled
os: [ubuntu-20.04]
- arch: [grayskull, wormhole_b0]
uses: ./.github/workflows/_build-wheels-impl.yaml
with:
os: ${{ matrix.os }}
- arch: ${{ matrix.arch }}
from-precompiled: true
secrets: inherit
test-wheels:
@@ -74,9 +72,9 @@ jobs:
needs: build-docker-image-2004
uses: ./.github/workflows/build-artifact.yaml
with:
- tracy: true
build-docker: false
build-type: ${{ inputs.build-type || 'Release' }}
+ tracy: true
secrets: inherit
# Slow Dispatch Unit Tests
sd-unit-tests:
diff --git a/.github/workflows/bisect-dispatch.yaml b/.github/workflows/bisect-dispatch.yaml
index 2cdfc3b17b4..12bda76c1fc 100644
--- a/.github/workflows/bisect-dispatch.yaml
+++ b/.github/workflows/bisect-dispatch.yaml
@@ -31,8 +31,6 @@ run-name: ${{ inputs.description }}
jobs:
build-artifact:
uses: ./.github/workflows/build-artifact.yaml
- with:
- arch: '[ "${{ inputs.arch }}" ]'
secrets: inherit
test-dispatch:
needs: build-artifact
@@ -50,9 +48,9 @@ jobs:
echo "TT_METAL_HOME=$(pwd)" >> $GITHUB_ENV
- uses: actions/download-artifact@v4
with:
- name: TTMetal_build_${{ inputs.arch }}
+ name: TTMetal_build_any
- name: Extract files
- run: tar -xvf ttm_${{ inputs.arch }}.tar
+ run: tar -xvf ttm_any.tar
- uses: ./.github/actions/install-python-deps
- name: Run pre/post regression tests in a loop
run: |
diff --git a/.github/workflows/blackhole-post-commit.yaml b/.github/workflows/blackhole-post-commit.yaml
index 72ba467fa92..2a3158a42a8 100644
--- a/.github/workflows/blackhole-post-commit.yaml
+++ b/.github/workflows/blackhole-post-commit.yaml
@@ -32,20 +32,13 @@ jobs:
secrets: inherit
with:
os: "ubuntu-22.04-amd64"
- arch: '["blackhole"]'
build-docker: false
build-wheels:
needs: build-artifact
uses: ./.github/workflows/_build-wheels-impl.yaml
with:
os: "ubuntu-22.04"
- arch: "blackhole"
from-precompiled: true
-# build-artifact-profiler:
-# uses: ./.github/workflows/build-artifact.yaml
-# with:
-# profiler-build: true
-# secrets: inherit
umd-unit-tests:
secrets: inherit
uses: ./.github/workflows/umd-unit-tests.yaml
diff --git a/.github/workflows/build-and-test-wheels.yaml b/.github/workflows/build-and-test-wheels.yaml
index 3f2385121a2..d21c08d1f76 100644
--- a/.github/workflows/build-and-test-wheels.yaml
+++ b/.github/workflows/build-and-test-wheels.yaml
@@ -23,11 +23,9 @@ jobs:
# Since pre-compiled builds only run on 20.04, we can only test on 20.04 for now
# The full 22.04 flow can be tested without precompiled
os: ${{ fromJson((github.event_name == 'schedule' || inputs.from-precompiled) && '["ubuntu-20.04"]' || '["ubuntu-20.04", "ubuntu-22.04"]') }}
- arch: [grayskull, wormhole_b0]
uses: ./.github/workflows/_build-wheels-impl.yaml
with:
os: ${{ matrix.os }}
- arch: ${{ matrix.arch }}
from-precompiled: ${{ inputs.from-precompiled }}
test-wheels:
needs: build-wheels
diff --git a/.github/workflows/build-and-unit-tests.yaml b/.github/workflows/build-and-unit-tests.yaml
index ef77466c208..489c4e75d8a 100644
--- a/.github/workflows/build-and-unit-tests.yaml
+++ b/.github/workflows/build-and-unit-tests.yaml
@@ -76,8 +76,6 @@ jobs:
steps:
- uses: tenstorrent/tt-metal/.github/actions/checkout-with-submodule-lfs@main
- uses: ./.github/actions/prepare-metal-run
- with:
- arch: ${{ inputs.arch }}
- name: ${{ matrix.test-group.name }} tests
timeout-minutes: ${{ inputs.timeout }}
uses: ./.github/actions/docker-run
diff --git a/.github/workflows/build-artifact.yaml b/.github/workflows/build-artifact.yaml
index 5add3efdde1..e1e436fadc4 100644
--- a/.github/workflows/build-artifact.yaml
+++ b/.github/workflows/build-artifact.yaml
@@ -3,10 +3,6 @@ name: "Build tt-metal artifacts"
on:
workflow_call:
inputs:
- arch:
- required: false
- type: string
- default: '["grayskull", "wormhole_b0"]'
build-type:
required: false
type: string
@@ -32,10 +28,6 @@ on:
type: boolean
default: true
description: "Build docker image"
- arch:
- required: false
- type: string
- default: '["grayskull", "wormhole_b0"]'
build-type:
required: false
type: string
@@ -63,12 +55,7 @@ jobs:
needs: build-docker-image
if: always()
timeout-minutes: 30
- strategy:
- matrix:
- arch: ${{ fromJson(inputs.arch || '["grayskull", "wormhole_b0", "blackhole"]') }}
env:
- TT_METAL_ENV: ${{ vars.TT_METAL_ENV }}
- ARCH_NAME: ${{ matrix.arch }}
SILENT: 0
VERBOSE: 1
runs-on:
@@ -123,7 +110,6 @@ jobs:
-v /etc/bashrc:/etc/bashrc:ro
-v /home/ubuntu/.ccache-ci:/home/ubuntu/.ccache
-v /mnt/MLPerf/ccache:/mnt/MLPerf/ccache
- -e ARCH_NAME=${{ matrix.arch }}
-e CARGO_HOME=${{ github.workspace }}/.cargo
-w ${{ github.workspace }}
run: |
@@ -151,9 +137,9 @@ jobs:
cat build/ccache.stats >> $GITHUB_STEP_SUMMARY
echo '```' >> $GITHUB_STEP_SUMMARY
- name: 'Tar files'
- run: tar -cvhf ttm_${{ matrix.arch }}.tar ttnn/ttnn/*.so build/lib ttnn/ttnn/*.so build/programming_examples build/test build/tools build/tt-train data runtime
+ run: tar -cvhf ttm_any.tar ttnn/ttnn/*.so build/lib ttnn/ttnn/*.so build/programming_examples build/test build/tools build/tt-train data runtime
- name: 'Upload Artifact'
uses: actions/upload-artifact@v4
with:
- name: TTMetal_build_${{ matrix.arch }}${{ (inputs.tracy && '_profiler') || '' }}
- path: ttm_${{ matrix.arch }}.tar
+ name: TTMetal_build_any${{ (inputs.tracy && '_profiler') || '' }}
+ path: ttm_any.tar
diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml
index abd651be8ca..dbebda60b94 100644
--- a/.github/workflows/build.yaml
+++ b/.github/workflows/build.yaml
@@ -20,13 +20,11 @@ jobs:
#{type: RelWithDebInfo, cxx_compiler: g++-12, c_compiler: gcc-12, runs-on: ["build", "in-service"], os: ubuntu-22.04},
{type: Release, cxx_compiler: g++-12, c_compiler: gcc-12, runs-on: ["build", "in-service"], os: ubuntu-22.04},
]
- arch: [grayskull, wormhole_b0, blackhole]
env:
- ARCH_NAME: ${{ matrix.arch }}
# So we can get all the makefile output we want
VERBOSE: 1
runs-on: ${{ matrix.build.runs-on }}
- name: ${{ matrix.build.type }} ${{ matrix.build.cxx_compiler }} ${{ matrix.arch }} ${{ matrix.build.os }}
+ name: ${{ matrix.build.type }} ${{ matrix.build.cxx_compiler }} any ${{ matrix.build.os }}
steps:
- name: Verify ccache availability
shell: bash
@@ -51,7 +49,6 @@ jobs:
docker_username: ${{ github.actor }}
docker_password: ${{ secrets.GITHUB_TOKEN }}
docker_opts: |
- -e ARCH_NAME=${{ matrix.arch }}
--group-add 1457
-v /home/ubuntu/.ccache-ci:/home/ubuntu/.ccache
-e CCACHE_DIR=/home/ubuntu/.ccache
diff --git a/.github/workflows/cpp-post-commit.yaml b/.github/workflows/cpp-post-commit.yaml
index ff8bb335639..c716f23f796 100644
--- a/.github/workflows/cpp-post-commit.yaml
+++ b/.github/workflows/cpp-post-commit.yaml
@@ -81,8 +81,6 @@ jobs:
steps:
- uses: tenstorrent/tt-metal/.github/actions/checkout-with-submodule-lfs@main
- uses: ./.github/actions/prepare-metal-run
- with:
- arch: ${{ inputs.arch }}
- name: ${{ matrix.test-group.name }} tests
#GH Issue 16167
if: ${{ !(inputs.runner-label == 'BH' && matrix.test-group.name == 'tools') }}
diff --git a/.github/workflows/docs-latest-public.yaml b/.github/workflows/docs-latest-public.yaml
index 7737395d5d5..2afe136086e 100644
--- a/.github/workflows/docs-latest-public.yaml
+++ b/.github/workflows/docs-latest-public.yaml
@@ -42,9 +42,9 @@ jobs:
os: ubuntu-20.04
- uses: actions/download-artifact@v4
with:
- name: TTMetal_build_${{ matrix.arch }}
+ name: TTMetal_build_any
- name: Extract files
- run: tar -xvf ttm_${{ matrix.arch }}.tar
+ run: tar -xvf ttm_any.tar
- uses: ./.github/actions/install-python-deps
- name: Build Docs
timeout-minutes: 15
diff --git a/.github/workflows/docs-release.yaml b/.github/workflows/docs-release.yaml
index b6c21291ca9..b68f85e61b2 100644
--- a/.github/workflows/docs-release.yaml
+++ b/.github/workflows/docs-release.yaml
@@ -51,9 +51,9 @@ jobs:
os: ubuntu-20.04
- uses: actions/download-artifact@v4
with:
- name: TTMetal_build_${{ matrix.arch }}
+ name: TTMetal_build_any
- name: Extract files
- run: tar -xvf ttm_${{ matrix.arch }}.tar
+ run: tar -xvf ttm_any.tar
- uses: ./.github/actions/install-python-deps
- name: Build Doxygen Docs
timeout-minutes: 15
diff --git a/.github/workflows/fast-dispatch-build-and-unit-tests-wrapper.yaml b/.github/workflows/fast-dispatch-build-and-unit-tests-wrapper.yaml
index cfbaf686cd5..c3e1c4f3879 100644
--- a/.github/workflows/fast-dispatch-build-and-unit-tests-wrapper.yaml
+++ b/.github/workflows/fast-dispatch-build-and-unit-tests-wrapper.yaml
@@ -11,9 +11,21 @@ jobs:
needs: build-docker-artifact
uses: ./.github/workflows/build-artifact.yaml
secrets: inherit
+ build-wheels:
+ needs: build-artifact
+ strategy:
+ matrix:
+ # Since pre-compiled builds only run on 20.04, we can only test on 20.04 for now
+ # The full 22.04 flow can be tested without precompiled
+ os: [ubuntu-20.04]
+ uses: ./.github/workflows/_build-wheels-impl.yaml
+ with:
+ os: ${{ matrix.os }}
+ from-precompiled: true
+ secrets: inherit
# FD Unit Tests
fast-dispatch-unit-tests:
- needs: build-artifact
+ needs: build-wheels
secrets: inherit
strategy:
fail-fast: false
@@ -29,7 +41,7 @@ jobs:
runner-label: ${{ matrix.test-group.runner-label}}
# TTNN FD Unit tests
ttnn-unit-tests:
- needs: build-artifact
+ needs: build-wheels
secrets: inherit
strategy:
fail-fast: false
@@ -46,7 +58,7 @@ jobs:
# FD Model Tests
models-unit-tests:
- needs: build-artifact
+ needs: build-wheels
secrets: inherit
strategy:
fail-fast: false
@@ -63,7 +75,7 @@ jobs:
# FD C++ Unit Tests
cpp-unit-tests:
- needs: build-artifact
+ needs: build-wheels
secrets: inherit
strategy:
fail-fast: false
diff --git a/.github/workflows/fast-dispatch-build-and-unit-tests.yaml b/.github/workflows/fast-dispatch-build-and-unit-tests.yaml
index 2c55a940034..8042f7cd7ca 100644
--- a/.github/workflows/fast-dispatch-build-and-unit-tests.yaml
+++ b/.github/workflows/fast-dispatch-build-and-unit-tests.yaml
@@ -73,7 +73,7 @@ jobs:
- uses: tenstorrent/tt-metal/.github/actions/checkout-with-submodule-lfs@main
- uses: actions/download-artifact@v4
with:
- name: eager-dist-${{ matrix.os }}-${{ inputs.arch }}
+ name: eager-dist-${{ matrix.os }}-any
- name: ${{ matrix.test-group.name }} tests
timeout-minutes: ${{ inputs.timeout }}
uses: ./.github/actions/docker-run
@@ -81,6 +81,8 @@ jobs:
docker_os_arch: tt-metalium/${{ inputs.os }}-amd64
install_wheel: true
docker_password: ${{ secrets.GITHUB_TOKEN }}
+ docker_opts: |
+ -e ARCH_NAME=${{ inputs.arch }}
run_args: |
${{ matrix.test-group.cmd }}
- uses: ./.github/actions/slack-report
diff --git a/.github/workflows/fast-dispatch-frequent-tests-impl.yaml b/.github/workflows/fast-dispatch-frequent-tests-impl.yaml
index 2dbc84d446b..e5dcf724344 100644
--- a/.github/workflows/fast-dispatch-frequent-tests-impl.yaml
+++ b/.github/workflows/fast-dispatch-frequent-tests-impl.yaml
@@ -16,15 +16,13 @@ jobs:
fail-fast: false
matrix:
test-group:
- [
- {
- name: "WH N300 pgm dispatch nightly",
- arch: wormhole_b0,
- runs-on: ["cloud-virtual-machine", "N300", "in-service"],
- cmd: ./tests/tt_metal/tt_metal/perf_microbenchmark/dispatch/compare_pgm_dispatch_perf_ci.sh,
- timeout: 10
- },
- ]
+ - name: "WH N300 pgm dispatch nightly"
+ arch: wormhole_b0
+ runs-on: ["cloud-virtual-machine", "N300", "in-service"]
+ run-args: |
+ ./build/test/tt_metal/perf_microbenchmark/dispatch/test_pgm_dispatch_wormhole_b0 --benchmark_out_format=json --benchmark_out=bench.json
+ ./tests/tt_metal/tt_metal/perf_microbenchmark/dispatch/compare_pgm_dispatch_perf_ci.py bench.json
+ timeout: 10
name: ${{ matrix.test-group.name }}
env:
LOGURU_LEVEL: INFO
@@ -32,8 +30,6 @@ jobs:
steps:
- uses: tenstorrent/tt-metal/.github/actions/checkout-with-submodule-lfs@main
- uses: ./.github/actions/prepare-metal-run
- with:
- arch: ${{ matrix.test-group.arch }}
- name: ${{ matrix.test-group.name }} tests
timeout-minutes: ${{ matrix.test-group.timeout }}
uses: ./.github/actions/docker-run
@@ -44,8 +40,7 @@ jobs:
-e TT_METAL_HOME=${{ github.workspace }}
-e ARCH_NAME=${{ matrix.test-group.arch }}
-e LD_LIBRARY_PATH=${{ github.workspace }}/build/lib
- run_args: |
- ${{ matrix.test-group.cmd }}
+ run_args: ${{ matrix.test-group.run-args }}
- uses: ./.github/actions/slack-report
if: ${{ failure() }}
with:
@@ -57,3 +52,9 @@ jobs:
path: |
generated/test_reports/
prefix: "test_reports_"
+ - uses: ./.github/actions/upload-artifact-with-job-uuid
+ if: ${{ !cancelled() }}
+ with:
+ path: |
+ bench.json
+ prefix: "pgm_benchmarks_json_"
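
Note on the step above: the benchmark binary now writes standard Google Benchmark JSON, which the CI script checks against the checked-in golden file (the pgm_dispatch_golden.json entry added to .clang-format-ignore earlier in this diff). A minimal sketch of that kind of comparison follows; it only assumes the documented Google Benchmark JSON layout, and the 5% threshold, key choice, and two-argument CLI are illustrative assumptions rather than the actual contents of compare_pgm_dispatch_perf_ci.py.

```python
#!/usr/bin/env python3
"""Sketch of a golden-file check for Google Benchmark JSON output.

Illustrative only: the repository's compare_pgm_dispatch_perf_ci.py may use
different keys, tolerances, and reporting. Only the JSON layout produced by
--benchmark_out_format=json is assumed here.
"""
import json
import sys

TOLERANCE = 0.05  # assumed 5% regression threshold


def load_times(path):
    with open(path) as f:
        data = json.load(f)
    # Google Benchmark stores one entry per benchmark under "benchmarks".
    return {b["name"]: b["real_time"] for b in data["benchmarks"]}


def main(result_path, golden_path):
    results, golden = load_times(result_path), load_times(golden_path)
    failures = []
    for name, golden_time in golden.items():
        measured = results.get(name)
        if measured is None:
            failures.append(f"{name}: missing from results")
        elif measured > golden_time * (1 + TOLERANCE):
            failures.append(f"{name}: {measured:.1f} exceeds golden {golden_time:.1f} by more than {TOLERANCE:.0%}")
    for line in failures:
        print(line)
    return 1 if failures else 0


if __name__ == "__main__":
    sys.exit(main(sys.argv[1], sys.argv[2]))
```
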
diff --git a/.github/workflows/fast-dispatch-full-regressions-and-models-impl.yaml b/.github/workflows/fast-dispatch-full-regressions-and-models-impl.yaml
index 8b71190eb2b..196bfe013f7 100644
--- a/.github/workflows/fast-dispatch-full-regressions-and-models-impl.yaml
+++ b/.github/workflows/fast-dispatch-full-regressions-and-models-impl.yaml
@@ -66,8 +66,6 @@ jobs:
run: |
echo "TT_METAL_HOME=$(pwd)" >> $GITHUB_ENV
- uses: ./.github/actions/prepare-metal-run
- with:
- arch: ${{ matrix.test-group.arch }}
- uses: ./.github/actions/install-python-deps
- name: Run frequent reg tests scripts
timeout-minutes: ${{ matrix.test-group.timeout }}
@@ -112,8 +110,6 @@ jobs:
run: |
echo "WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yaml" >> $GITHUB_ENV
- uses: ./.github/actions/prepare-metal-run
- with:
- arch: wormhole_b0
- uses: ./.github/actions/install-python-deps
- name: Run frequent reg tests scripts
timeout-minutes: 30
@@ -179,8 +175,6 @@ jobs:
run: |
echo "WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yaml" >> $GITHUB_ENV
- uses: ./.github/actions/prepare-metal-run
- with:
- arch: wormhole_b0
- uses: ./.github/actions/install-python-deps
- name: Run frequent reg tests scripts
timeout-minutes: 60
diff --git a/.github/workflows/full-regressions-and-models.yaml b/.github/workflows/full-regressions-and-models.yaml
index b5c4cb3a483..0c424f5e4f5 100644
--- a/.github/workflows/full-regressions-and-models.yaml
+++ b/.github/workflows/full-regressions-and-models.yaml
@@ -34,9 +34,9 @@ jobs:
echo "TT_METAL_HOME=$(pwd)" >> $GITHUB_ENV
- uses: actions/download-artifact@v4
with:
- name: TTMetal_build_${{ matrix.arch }}
+ name: TTMetal_build_any
- name: Extract files
- run: tar -xvf ttm_${{ matrix.arch }}.tar
+ run: tar -xvf ttm_any.tar
- uses: ./.github/actions/install-python-deps
- name: Run frequent reg tests scripts
timeout-minutes: 210
diff --git a/.github/workflows/models-post-commit-wrapper.yaml b/.github/workflows/models-post-commit-wrapper.yaml
index 86533af4570..ccdccc25a4a 100644
--- a/.github/workflows/models-post-commit-wrapper.yaml
+++ b/.github/workflows/models-post-commit-wrapper.yaml
@@ -22,11 +22,9 @@ jobs:
# Since pre-compiled builds only run on 20.04, we can only test on 20.04 for now
# The full 22.04 flow can be tested without precompiled
os: [ubuntu-20.04]
- arch: [grayskull, wormhole_b0]
uses: ./.github/workflows/_build-wheels-impl.yaml
with:
os: ${{ matrix.os }}
- arch: ${{ matrix.arch }}
from-precompiled: true
secrets: inherit
models-unit-tests:
diff --git a/.github/workflows/models-post-commit.yaml b/.github/workflows/models-post-commit.yaml
index 8e14f413db4..6784790f115 100644
--- a/.github/workflows/models-post-commit.yaml
+++ b/.github/workflows/models-post-commit.yaml
@@ -63,13 +63,15 @@ jobs:
command: ./.github/scripts/cloud_utils/mount_weka.sh
- uses: actions/download-artifact@v4
with:
- name: eager-dist-${{ matrix.os }}-${{ inputs.arch }}
+ name: eager-dist-${{ matrix.os }}-any
- name: ${{ matrix.test-group.name }} tests
timeout-minutes: ${{ inputs.timeout }}
uses: ./.github/actions/docker-run
with:
install_wheel: true
docker_password: ${{ secrets.GITHUB_TOKEN }}
+ docker_opts: |
+ -e ARCH_NAME=${{ inputs.arch }}
run_args: |
source tests/scripts/run_python_model_tests.sh && run_python_model_tests_${{ inputs.arch }}
- uses: ./.github/actions/slack-report
diff --git a/.github/workflows/package-and-release.yaml b/.github/workflows/package-and-release.yaml
index d695c8245e9..b8834c1a6f5 100644
--- a/.github/workflows/package-and-release.yaml
+++ b/.github/workflows/package-and-release.yaml
@@ -123,11 +123,9 @@ jobs:
strategy:
matrix:
os: [ubuntu-20.04]
- arch: [grayskull, wormhole_b0]
uses: ./.github/workflows/_build-wheels-impl.yaml
with:
os: ${{ matrix.os }}
- arch: ${{ matrix.arch }}
from-precompiled: false
# Candidate for breaking up
create-and-upload-draft-release:
@@ -139,7 +137,6 @@ jobs:
strategy:
matrix:
os: [ubuntu-20.04]
- arch: [grayskull, wormhole_b0]
# May accidentally create two releases without restricting to 1 job
concurrency: create_upload_draft_release
runs-on: ubuntu-latest
@@ -149,7 +146,7 @@ jobs:
- name: Download eager Python packages
uses: actions/download-artifact@v4
with:
- name: eager-dist-${{ matrix.os }}-${{ matrix.arch }}
+ name: eager-dist-${{ matrix.os }}-any
- name: Create VERSION
run: echo ${{ needs.create-tag.outputs.version }} > VERSION
- name : Download release notes
diff --git a/.github/workflows/perf-device-models-impl.yaml b/.github/workflows/perf-device-models-impl.yaml
index a95f650a568..43610aa2cfd 100644
--- a/.github/workflows/perf-device-models-impl.yaml
+++ b/.github/workflows/perf-device-models-impl.yaml
@@ -34,7 +34,6 @@ jobs:
echo "TT_METAL_HOME=$(pwd)" >> $GITHUB_ENV
- uses: ./.github/actions/prepare-metal-run
with:
- arch: ${{ matrix.test-info.arch }}
is_profiler: 'true'
- name: ${{ matrix.test-group.name }} tests
timeout-minutes: ${{ matrix.test-info.timeout }}
diff --git a/.github/workflows/perf-models-impl.yaml b/.github/workflows/perf-models-impl.yaml
index 13159b61f16..153e303001e 100644
--- a/.github/workflows/perf-models-impl.yaml
+++ b/.github/workflows/perf-models-impl.yaml
@@ -34,9 +34,9 @@ jobs:
echo "PYTHONPATH=$(pwd)" >> $GITHUB_ENV
- uses: actions/download-artifact@v4
with:
- name: TTMetal_build_${{ matrix.test-info.arch }}
+ name: TTMetal_build_any
- name: Extract files
- run: tar -xvf ttm_${{ matrix.test-info.arch }}.tar
+ run: tar -xvf ttm_any.tar
- uses: ./.github/actions/install-python-deps
- name: Run performance regressions
id: performance_tests
diff --git a/.github/workflows/publish-release-image-wrapper.yaml b/.github/workflows/publish-release-image-wrapper.yaml
index cc22e9f23c8..45ff119d4d4 100644
--- a/.github/workflows/publish-release-image-wrapper.yaml
+++ b/.github/workflows/publish-release-image-wrapper.yaml
@@ -13,11 +13,9 @@ jobs:
# Since pre-compiled builds only run on 20.04, we can only test on 20.04 for now
# The full 22.04 flow can be tested without precompiled
os: [ubuntu-20.04]
- arch: [grayskull, wormhole_b0]
uses: ./.github/workflows/_build-wheels-impl.yaml
with:
os: ${{ matrix.os }}
- arch: ${{ matrix.arch }}
from-precompiled: true
publish-release-image:
needs: build-wheels
diff --git a/.github/workflows/publish-release-image.yaml b/.github/workflows/publish-release-image.yaml
index 64f8a2f3d29..586cb2c79a3 100644
--- a/.github/workflows/publish-release-image.yaml
+++ b/.github/workflows/publish-release-image.yaml
@@ -37,7 +37,7 @@ jobs:
- name: Download wheels
uses: actions/download-artifact@v4
with:
- name: eager-dist-${{ matrix.os }}-${{ matrix.arch }}
+ name: eager-dist-${{ matrix.os }}-any
- name: Get the name of the wheel and set up env variables
id: generate-tag-name
run: |
diff --git a/.github/workflows/run-profiler-regression.yaml b/.github/workflows/run-profiler-regression.yaml
index 07290e2fc8a..adbef02dea0 100644
--- a/.github/workflows/run-profiler-regression.yaml
+++ b/.github/workflows/run-profiler-regression.yaml
@@ -31,9 +31,9 @@ jobs:
echo "TT_METAL_HOME=$(pwd)" >> $GITHUB_ENV
- uses: actions/download-artifact@v4
with:
- name: TTMetal_build_${{ matrix.runner-info.arch }}_profiler
+ name: TTMetal_build_any_profiler
- name: Extract files
- run: tar -xvf ttm_${{ matrix.runner-info.arch }}.tar
+ run: tar -xvf ttm_any.tar
- uses: ./.github/actions/install-python-deps
- name: Run profiler regression tests
timeout-minutes: 30
diff --git a/.github/workflows/single-card-demo-tests-impl.yaml b/.github/workflows/single-card-demo-tests-impl.yaml
index 43780149629..3368012b3b9 100644
--- a/.github/workflows/single-card-demo-tests-impl.yaml
+++ b/.github/workflows/single-card-demo-tests-impl.yaml
@@ -46,8 +46,6 @@ jobs:
run: |
echo "TT_METAL_HOME=$(pwd)" >> $GITHUB_ENV
- uses: ./.github/actions/prepare-metal-run
- with:
- arch: ${{ matrix.test-group.arch }}
- uses: ./.github/actions/install-python-deps
- name: Run demo regression tests
timeout-minutes: 70
diff --git a/.github/workflows/single-card-demo-tests.yaml b/.github/workflows/single-card-demo-tests.yaml
index ef7c101d8fb..0e98fd9052a 100644
--- a/.github/workflows/single-card-demo-tests.yaml
+++ b/.github/workflows/single-card-demo-tests.yaml
@@ -10,8 +10,6 @@ on:
jobs:
build-artifact:
uses: ./.github/workflows/build-artifact.yaml
- with:
- arch: '["wormhole_b0"]'
secrets: inherit
single-card-demo-tests:
needs: build-artifact
diff --git a/.github/workflows/stress-fast-dispatch-build-and-unit-tests.yaml b/.github/workflows/stress-fast-dispatch-build-and-unit-tests.yaml
index c45b33ccccf..205e86cceb9 100644
--- a/.github/workflows/stress-fast-dispatch-build-and-unit-tests.yaml
+++ b/.github/workflows/stress-fast-dispatch-build-and-unit-tests.yaml
@@ -41,9 +41,9 @@ jobs:
echo "TT_METAL_HOME=$(pwd)" >> $GITHUB_ENV
- uses: actions/download-artifact@v4
with:
- name: TTMetal_build_${{ matrix.runner-info.arch }}
+ name: TTMetal_build_any
- name: Extract files
- run: tar -xvf ttm_${{ matrix.runner-info.arch }}.tar
+ run: tar -xvf ttm_any.tar
- uses: ./.github/actions/install-python-deps
- name: Run pre/post regression tests in a loop
run: |
diff --git a/.github/workflows/stress-slow-dispatch-build-and-unit-tests.yaml b/.github/workflows/stress-slow-dispatch-build-and-unit-tests.yaml
index 1976249eba1..ce01df49a5c 100644
--- a/.github/workflows/stress-slow-dispatch-build-and-unit-tests.yaml
+++ b/.github/workflows/stress-slow-dispatch-build-and-unit-tests.yaml
@@ -40,9 +40,9 @@ jobs:
echo "TT_METAL_HOME=$(pwd)" >> $GITHUB_ENV
- uses: actions/download-artifact@v4
with:
- name: TTMetal_build_${{ matrix.arch }}
+ name: TTMetal_build_any
- name: Extract files
- run: tar -xvf ttm_${{ matrix.arch }}.tar
+ run: tar -xvf ttm_any.tar
- uses: ./.github/actions/install-python-deps
- name: Run pre/post regression tests in a loop
run: |
diff --git a/.github/workflows/t3000-demo-tests-impl.yaml b/.github/workflows/t3000-demo-tests-impl.yaml
index 9ad4ab1b818..744f6475d44 100644
--- a/.github/workflows/t3000-demo-tests-impl.yaml
+++ b/.github/workflows/t3000-demo-tests-impl.yaml
@@ -44,9 +44,9 @@ jobs:
echo "TT_METAL_HOME=$(pwd)" >> $GITHUB_ENV
- uses: actions/download-artifact@v4
with:
- name: TTMetal_build_${{ matrix.test-group.arch }}
+ name: TTMetal_build_any
- name: Extract files
- run: tar -xvf ttm_${{ matrix.test-group.arch }}.tar
+ run: tar -xvf ttm_any.tar
- uses: ./.github/actions/install-python-deps
- name: Run demo regression tests
shell: bash {0}
diff --git a/.github/workflows/t3000-demo-tests.yaml b/.github/workflows/t3000-demo-tests.yaml
index 5ed80a3861d..9d1a5ad7e57 100644
--- a/.github/workflows/t3000-demo-tests.yaml
+++ b/.github/workflows/t3000-demo-tests.yaml
@@ -9,8 +9,6 @@ on:
jobs:
build-artifact:
uses: ./.github/workflows/build-artifact.yaml
- with:
- arch: '["wormhole_b0"]'
secrets: inherit
t3000-demo-tests:
needs: build-artifact
diff --git a/.github/workflows/t3000-frequent-tests-impl.yaml b/.github/workflows/t3000-frequent-tests-impl.yaml
index 11a2df7b146..f538f9ba3cf 100644
--- a/.github/workflows/t3000-frequent-tests-impl.yaml
+++ b/.github/workflows/t3000-frequent-tests-impl.yaml
@@ -46,9 +46,9 @@ jobs:
echo "TT_METAL_HOME=$(pwd)" >> $GITHUB_ENV
- uses: actions/download-artifact@v4
with:
- name: TTMetal_build_${{ matrix.test-group.arch }}
+ name: TTMetal_build_any
- name: Extract files
- run: tar -xvf ttm_${{ matrix.test-group.arch }}.tar
+ run: tar -xvf ttm_any.tar
- uses: ./.github/actions/install-python-deps
- name: Run frequent regression tests
shell: bash {0}
diff --git a/.github/workflows/t3000-frequent-tests.yaml b/.github/workflows/t3000-frequent-tests.yaml
index dd56ffe0aa1..8ab4ed51dbe 100644
--- a/.github/workflows/t3000-frequent-tests.yaml
+++ b/.github/workflows/t3000-frequent-tests.yaml
@@ -8,8 +8,6 @@ on:
jobs:
build-artifact:
uses: ./.github/workflows/build-artifact.yaml
- with:
- arch: '["wormhole_b0"]'
secrets: inherit
t3000-frequent-tests:
needs: build-artifact
diff --git a/.github/workflows/t3000-model-perf-tests-impl.yaml b/.github/workflows/t3000-model-perf-tests-impl.yaml
index 387a18d15a2..d63b96dd421 100644
--- a/.github/workflows/t3000-model-perf-tests-impl.yaml
+++ b/.github/workflows/t3000-model-perf-tests-impl.yaml
@@ -49,17 +49,17 @@ jobs:
if: ${{ matrix.test-group.tracy }}
uses: actions/download-artifact@v4
with:
- name: TTMetal_build_${{ matrix.test-group.arch }}_profiler
+ name: TTMetal_build_any_profiler
continue-on-error: true
- name: Download build artifact
id: download-artifact
if: ${{ !matrix.test-group.tracy }}
uses: actions/download-artifact@v4
with:
- name: TTMetal_build_${{ matrix.test-group.arch }}
+ name: TTMetal_build_any
- name: Extract files
if: ${{ matrix.test-group.tracy && steps.download-profiler-artifact.outcome == 'success' || !matrix.test-group.tracy }}
- run: tar -xvf ttm_${{ matrix.test-group.arch }}.tar
+ run: tar -xvf ttm_any.tar
- uses: ./.github/actions/install-python-deps
- name: Run model perf regression tests
if: ${{ matrix.test-group.tracy && steps.download-profiler-artifact.outcome == 'success' || !matrix.test-group.tracy }}
diff --git a/.github/workflows/t3000-model-perf-tests.yaml b/.github/workflows/t3000-model-perf-tests.yaml
index 15d96746889..aa31d66e875 100644
--- a/.github/workflows/t3000-model-perf-tests.yaml
+++ b/.github/workflows/t3000-model-perf-tests.yaml
@@ -8,13 +8,10 @@ on:
jobs:
build-artifact:
uses: ./.github/workflows/build-artifact.yaml
- with:
- arch: '["wormhole_b0"]'
secrets: inherit
build-artifact-profiler:
uses: ./.github/workflows/build-artifact.yaml
with:
- arch: '["wormhole_b0"]'
tracy: true
secrets: inherit
t3000-model-perf-tests:
diff --git a/.github/workflows/t3000-nightly-tests-impl.yaml b/.github/workflows/t3000-nightly-tests-impl.yaml
index 7b445b3204b..d2bc182e92f 100644
--- a/.github/workflows/t3000-nightly-tests-impl.yaml
+++ b/.github/workflows/t3000-nightly-tests-impl.yaml
@@ -35,9 +35,9 @@ jobs:
echo "TT_METAL_HOME=$(pwd)" >> $GITHUB_ENV
- uses: actions/download-artifact@v4
with:
- name: TTMetal_build_${{ matrix.test-group.arch }}
+ name: TTMetal_build_any
- name: Extract files
- run: tar -xvf ttm_${{ matrix.test-group.arch }}.tar
+ run: tar -xvf ttm_any.tar
- uses: ./.github/actions/install-python-deps
- name: Run demo regression tests
shell: bash {0}
diff --git a/.github/workflows/t3000-nightly-tests.yaml b/.github/workflows/t3000-nightly-tests.yaml
index 58944fa282f..a62267b3b12 100644
--- a/.github/workflows/t3000-nightly-tests.yaml
+++ b/.github/workflows/t3000-nightly-tests.yaml
@@ -8,8 +8,6 @@ on:
jobs:
build-artifact:
uses: ./.github/workflows/build-artifact.yaml
- with:
- arch: '["wormhole_b0"]'
secrets: inherit
t3000-nightly-tests:
needs: build-artifact
diff --git a/.github/workflows/t3000-perplexity-tests-impl.yaml b/.github/workflows/t3000-perplexity-tests-impl.yaml
index 6779624d550..9b6384bb491 100644
--- a/.github/workflows/t3000-perplexity-tests-impl.yaml
+++ b/.github/workflows/t3000-perplexity-tests-impl.yaml
@@ -34,9 +34,9 @@ jobs:
echo "TT_METAL_HOME=$(pwd)" >> $GITHUB_ENV
- uses: actions/download-artifact@v4
with:
- name: TTMetal_build_${{ matrix.test-group.arch }}
+ name: TTMetal_build_any
- name: Extract files
- run: tar -xvf ttm_${{ matrix.test-group.arch }}.tar
+ run: tar -xvf ttm_any.tar
- uses: ./.github/actions/install-python-deps
- name: Run perplexity tests
shell: bash {0}
diff --git a/.github/workflows/t3000-perplexity-tests.yaml b/.github/workflows/t3000-perplexity-tests.yaml
index c7d8f2d16ea..680a564d646 100644
--- a/.github/workflows/t3000-perplexity-tests.yaml
+++ b/.github/workflows/t3000-perplexity-tests.yaml
@@ -8,8 +8,6 @@ on:
jobs:
build-artifact:
uses: ./.github/workflows/build-artifact.yaml
- with:
- arch: '["wormhole_b0"]'
secrets: inherit
t3000-model-accuracy-perplexity-tests:
needs: build-artifact
diff --git a/.github/workflows/t3000-profiler-tests-impl.yaml b/.github/workflows/t3000-profiler-tests-impl.yaml
index c1d5cf01247..d9847249087 100644
--- a/.github/workflows/t3000-profiler-tests-impl.yaml
+++ b/.github/workflows/t3000-profiler-tests-impl.yaml
@@ -35,9 +35,9 @@ jobs:
echo "TT_METAL_HOME=$(pwd)" >> $GITHUB_ENV
- uses: actions/download-artifact@v4
with:
- name: TTMetal_build_${{ matrix.test-group.arch }}_profiler
+ name: TTMetal_build_any_profiler
- name: Extract files
- run: tar -xvf ttm_${{ matrix.test-group.arch }}.tar
+ run: tar -xvf ttm_any.tar
- uses: ./.github/actions/install-python-deps
- name: Run profiler regression tests
timeout-minutes: 30
diff --git a/.github/workflows/t3000-profiler-tests.yaml b/.github/workflows/t3000-profiler-tests.yaml
index ccc9dda2876..08d5f6ea0dd 100644
--- a/.github/workflows/t3000-profiler-tests.yaml
+++ b/.github/workflows/t3000-profiler-tests.yaml
@@ -10,7 +10,6 @@ jobs:
build-artifact-profiler:
uses: ./.github/workflows/build-artifact.yaml
with:
- arch: '["wormhole_b0"]'
tracy: true
secrets: inherit
t3000-profiler-tests:
diff --git a/.github/workflows/t3000-unit-tests-impl.yaml b/.github/workflows/t3000-unit-tests-impl.yaml
index f983a14b43a..ea077571775 100644
--- a/.github/workflows/t3000-unit-tests-impl.yaml
+++ b/.github/workflows/t3000-unit-tests-impl.yaml
@@ -47,9 +47,9 @@ jobs:
- uses: ./.github/actions/ensure-active-weka-mount
- uses: actions/download-artifact@v4
with:
- name: TTMetal_build_${{ matrix.test-group.arch }}
+ name: TTMetal_build_any
- name: Extract files
- run: tar -xvf ttm_${{ matrix.test-group.arch }}.tar
+ run: tar -xvf ttm_any.tar
- uses: ./.github/actions/install-python-deps
- name: Run unit regression tests
shell: bash {0}
diff --git a/.github/workflows/t3000-unit-tests.yaml b/.github/workflows/t3000-unit-tests.yaml
index c753e82c4ac..9950b40a295 100644
--- a/.github/workflows/t3000-unit-tests.yaml
+++ b/.github/workflows/t3000-unit-tests.yaml
@@ -8,8 +8,6 @@ on:
jobs:
build-artifact:
uses: ./.github/workflows/build-artifact.yaml
- with:
- arch: '["wormhole_b0"]'
secrets: inherit
t3000-unit-tests:
needs: build-artifact
diff --git a/.github/workflows/test-dispatch.yaml b/.github/workflows/test-dispatch.yaml
index c0b7ef92c91..d14ec14f6df 100644
--- a/.github/workflows/test-dispatch.yaml
+++ b/.github/workflows/test-dispatch.yaml
@@ -52,7 +52,6 @@ jobs:
build-artifact:
uses: ./.github/workflows/build-artifact.yaml
with:
- arch: '[ "${{ inputs.arch }}" ]'
build-type: ${{ inputs.build-type }}
tracy: ${{ inputs.tracy }}
secrets: inherit
@@ -75,9 +74,9 @@ jobs:
echo "TT_METAL_HOME=$(pwd)" >> $GITHUB_ENV
- uses: actions/download-artifact@v4
with:
- name: TTMetal_build_${{ inputs.arch }}${{ (inputs.tracy && '_profiler') || '' }}
+ name: TTMetal_build_any${{ (inputs.tracy && '_profiler') || '' }}
- name: Extract files
- run: tar -xvf ttm_${{ inputs.arch }}.tar
+ run: tar -xvf ttm_any.tar
- uses: ./.github/actions/install-python-deps
- name: Run pre/post regression tests in a loop
run: |
diff --git a/.github/workflows/tg-demo-tests-impl.yaml b/.github/workflows/tg-demo-tests-impl.yaml
index f4956749a5e..b5547d2abd6 100644
--- a/.github/workflows/tg-demo-tests-impl.yaml
+++ b/.github/workflows/tg-demo-tests-impl.yaml
@@ -30,9 +30,9 @@ jobs:
echo "TT_METAL_HOME=$(pwd)" >> $GITHUB_ENV
- uses: actions/download-artifact@v4
with:
- name: TTMetal_build_${{ matrix.test-group.arch }}
+ name: TTMetal_build_any
- name: Extract files
- run: tar -xvf ttm_${{ matrix.test-group.arch }}.tar
+ run: tar -xvf ttm_any.tar
- uses: ./.github/actions/install-python-deps
- name: Run demo regression tests
timeout-minutes: ${{ matrix.test-group.timeout }}
diff --git a/.github/workflows/tg-demo-tests.yaml b/.github/workflows/tg-demo-tests.yaml
index b8e31e4c49d..343b047db67 100644
--- a/.github/workflows/tg-demo-tests.yaml
+++ b/.github/workflows/tg-demo-tests.yaml
@@ -8,8 +8,6 @@ on:
jobs:
build-artifact:
uses: ./.github/workflows/build-artifact.yaml
- with:
- arch: '["wormhole_b0"]'
secrets: inherit
tg-demo-tests:
needs: build-artifact
diff --git a/.github/workflows/tg-frequent-tests-impl.yaml b/.github/workflows/tg-frequent-tests-impl.yaml
index fbc89ab24d0..a1577350e10 100644
--- a/.github/workflows/tg-frequent-tests-impl.yaml
+++ b/.github/workflows/tg-frequent-tests-impl.yaml
@@ -32,9 +32,9 @@ jobs:
echo "TT_METAL_HOME=$(pwd)" >> $GITHUB_ENV
- uses: actions/download-artifact@v4
with:
- name: TTMetal_build_${{ matrix.test-group.arch }}
+ name: TTMetal_build_any
- name: Extract files
- run: tar -xvf ttm_${{ matrix.test-group.arch }}.tar
+ run: tar -xvf ttm_any.tar
- uses: ./.github/actions/install-python-deps
- name: Run frequent regression tests
timeout-minutes: ${{ matrix.test-group.timeout }}
diff --git a/.github/workflows/tg-frequent-tests.yaml b/.github/workflows/tg-frequent-tests.yaml
index 85fb5e16342..285d65e5e27 100644
--- a/.github/workflows/tg-frequent-tests.yaml
+++ b/.github/workflows/tg-frequent-tests.yaml
@@ -8,8 +8,6 @@ on:
jobs:
build-artifact:
uses: ./.github/workflows/build-artifact.yaml
- with:
- arch: '["wormhole_b0"]'
secrets: inherit
tg-frequent-tests:
needs: build-artifact
diff --git a/.github/workflows/tg-model-perf-tests-impl.yaml b/.github/workflows/tg-model-perf-tests-impl.yaml
index 2ba22cfc0c8..5ce68339f04 100644
--- a/.github/workflows/tg-model-perf-tests-impl.yaml
+++ b/.github/workflows/tg-model-perf-tests-impl.yaml
@@ -51,9 +51,9 @@ jobs:
- name: Download profiler build artifact
uses: actions/download-artifact@v4
with:
- name: TTMetal_build_${{ matrix.test-group.arch }}_profiler
+ name: TTMetal_build_any_profiler
- name: Extract files
- run: tar -xvf ttm_${{ matrix.test-group.arch }}.tar
+ run: tar -xvf ttm_any.tar
- uses: ./.github/actions/install-python-deps
- name: Run model perf regression tests
timeout-minutes: 60
diff --git a/.github/workflows/tg-model-perf-tests.yaml b/.github/workflows/tg-model-perf-tests.yaml
index 4202cc46ad3..a8bb64dff46 100644
--- a/.github/workflows/tg-model-perf-tests.yaml
+++ b/.github/workflows/tg-model-perf-tests.yaml
@@ -9,7 +9,6 @@ jobs:
build-artifact-profiler:
uses: ./.github/workflows/build-artifact.yaml
with:
- arch: '["wormhole_b0"]'
tracy: true
secrets: inherit
tg-model-perf-tests:
diff --git a/.github/workflows/tg-nightly-tests.yaml b/.github/workflows/tg-nightly-tests.yaml
index bee91a86e0b..ce8f9897ffb 100644
--- a/.github/workflows/tg-nightly-tests.yaml
+++ b/.github/workflows/tg-nightly-tests.yaml
@@ -8,8 +8,6 @@ on:
jobs:
build-artifact:
uses: ./.github/workflows/build-artifact.yaml
- with:
- arch: '["wormhole_b0"]'
secrets: inherit
tg-nightly-tests:
needs: build-artifact
@@ -39,9 +37,9 @@ jobs:
echo "TT_METAL_HOME=$(pwd)" >> $GITHUB_ENV
- uses: actions/download-artifact@v4
with:
- name: TTMetal_build_${{ matrix.test-group.arch }}
+ name: TTMetal_build_any
- name: Extract files
- run: tar -xvf ttm_${{ matrix.test-group.arch }}.tar
+ run: tar -xvf ttm_any.tar
- uses: ./.github/actions/install-python-deps
- name: Run demo regression tests
shell: bash {0}
diff --git a/.github/workflows/tg-unit-tests-impl.yaml b/.github/workflows/tg-unit-tests-impl.yaml
index 500717f87d0..a3d3b109d53 100644
--- a/.github/workflows/tg-unit-tests-impl.yaml
+++ b/.github/workflows/tg-unit-tests-impl.yaml
@@ -65,9 +65,9 @@ jobs:
echo "TT_METAL_HOME=$(pwd)" >> $GITHUB_ENV
- uses: actions/download-artifact@v4
with:
- name: TTMetal_build_${{ matrix.test-group.arch }}
+ name: TTMetal_build_any
- name: Extract files
- run: tar -xvf ttm_${{ matrix.test-group.arch }}.tar
+ run: tar -xvf ttm_any.tar
- uses: ./.github/actions/install-python-deps
- name: Run unit regression tests
timeout-minutes: ${{ matrix.test-group.timeout }}
diff --git a/.github/workflows/tg-unit-tests.yaml b/.github/workflows/tg-unit-tests.yaml
index a28497faedb..dfa3483896b 100644
--- a/.github/workflows/tg-unit-tests.yaml
+++ b/.github/workflows/tg-unit-tests.yaml
@@ -8,8 +8,6 @@ on:
jobs:
build-artifact:
uses: ./.github/workflows/build-artifact.yaml
- with:
- arch: '["wormhole_b0"]'
secrets: inherit
TG-Unit-tests:
needs: build-artifact
diff --git a/.github/workflows/tgg-demo-tests.yaml b/.github/workflows/tgg-demo-tests.yaml
index 13f9fc3b8c5..0cab3fdd13d 100644
--- a/.github/workflows/tgg-demo-tests.yaml
+++ b/.github/workflows/tgg-demo-tests.yaml
@@ -8,8 +8,6 @@ on:
jobs:
build-artifact:
uses: ./.github/workflows/build-artifact.yaml
- with:
- arch: '["wormhole_b0"]'
secrets: inherit
tgg-demo-tests:
needs: build-artifact
@@ -38,9 +36,9 @@ jobs:
echo "TT_METAL_HOME=$(pwd)" >> $GITHUB_ENV
- uses: actions/download-artifact@v4
with:
- name: TTMetal_build_${{ matrix.test-group.arch }}
+ name: TTMetal_build_any
- name: Extract files
- run: tar -xvf ttm_${{ matrix.test-group.arch }}.tar
+ run: tar -xvf ttm_any.tar
- uses: ./.github/actions/install-python-deps
- name: Run demo regression tests
timeout-minutes: 180
diff --git a/.github/workflows/tgg-frequent-tests-impl.yaml b/.github/workflows/tgg-frequent-tests-impl.yaml
index af54e8e89be..b042635fece 100644
--- a/.github/workflows/tgg-frequent-tests-impl.yaml
+++ b/.github/workflows/tgg-frequent-tests-impl.yaml
@@ -30,9 +30,9 @@ jobs:
echo "TT_METAL_HOME=$(pwd)" >> $GITHUB_ENV
- uses: actions/download-artifact@v4
with:
- name: TTMetal_build_${{ matrix.test-group.arch }}
+ name: TTMetal_build_any
- name: Extract files
- run: tar -xvf ttm_${{ matrix.test-group.arch }}.tar
+ run: tar -xvf ttm_any.tar
- uses: ./.github/actions/install-python-deps
- name: Run frequent regression tests
timeout-minutes: 90
diff --git a/.github/workflows/tgg-frequent-tests.yaml b/.github/workflows/tgg-frequent-tests.yaml
index 36355e3a27b..4c15f1c7209 100644
--- a/.github/workflows/tgg-frequent-tests.yaml
+++ b/.github/workflows/tgg-frequent-tests.yaml
@@ -8,8 +8,6 @@ on:
jobs:
build-artifact:
uses: ./.github/workflows/build-artifact.yaml
- with:
- arch: '["wormhole_b0"]'
secrets: inherit
tgg-frequent-tests:
needs: build-artifact
diff --git a/.github/workflows/tgg-model-perf-tests-impl.yaml b/.github/workflows/tgg-model-perf-tests-impl.yaml
index c79b84b8e01..c487d43d7e3 100644
--- a/.github/workflows/tgg-model-perf-tests-impl.yaml
+++ b/.github/workflows/tgg-model-perf-tests-impl.yaml
@@ -43,9 +43,9 @@ jobs:
echo "PYTHONPATH=$(pwd)" >> $GITHUB_ENV
- uses: actions/download-artifact@v4
with:
- name: TTMetal_build_${{ matrix.test-group.arch }}
+ name: TTMetal_build_any
- name: Extract files
- run: tar -xvf ttm_${{ matrix.test-group.arch }}.tar
+ run: tar -xvf ttm_any.tar
- uses: ./.github/actions/install-python-deps
- name: Run model perf regression tests
timeout-minutes: 60
diff --git a/.github/workflows/tgg-model-perf-tests.yaml b/.github/workflows/tgg-model-perf-tests.yaml
index c65fc7408d6..6b76f5ab177 100644
--- a/.github/workflows/tgg-model-perf-tests.yaml
+++ b/.github/workflows/tgg-model-perf-tests.yaml
@@ -8,8 +8,6 @@ on:
jobs:
build-artifact:
uses: ./.github/workflows/build-artifact.yaml
- with:
- arch: '["wormhole_b0"]'
secrets: inherit
tgg-model-perf-tests:
needs: build-artifact
diff --git a/.github/workflows/tgg-unit-tests-impl.yaml b/.github/workflows/tgg-unit-tests-impl.yaml
index 12d03f7686b..5313e0610c4 100644
--- a/.github/workflows/tgg-unit-tests-impl.yaml
+++ b/.github/workflows/tgg-unit-tests-impl.yaml
@@ -30,9 +30,9 @@ jobs:
echo "TT_METAL_HOME=$(pwd)" >> $GITHUB_ENV
- uses: actions/download-artifact@v4
with:
- name: TTMetal_build_${{ matrix.test-group.arch }}
+ name: TTMetal_build_any
- name: Extract files
- run: tar -xvf ttm_${{ matrix.test-group.arch }}.tar
+ run: tar -xvf ttm_any.tar
- uses: ./.github/actions/install-python-deps
- name: Run unit regression tests
timeout-minutes: 60
diff --git a/.github/workflows/tgg-unit-tests.yaml b/.github/workflows/tgg-unit-tests.yaml
index f9be79c02f2..6c42ff61f4f 100644
--- a/.github/workflows/tgg-unit-tests.yaml
+++ b/.github/workflows/tgg-unit-tests.yaml
@@ -8,8 +8,6 @@ on:
jobs:
build-artifact:
uses: ./.github/workflows/build-artifact.yaml
- with:
- arch: '["wormhole_b0"]'
secrets: inherit
TGG-tests:
needs: build-artifact
diff --git a/.github/workflows/tt-train-post-commit.yaml b/.github/workflows/tt-train-post-commit.yaml
index 7a8f3971f1e..1ecdcabfd17 100644
--- a/.github/workflows/tt-train-post-commit.yaml
+++ b/.github/workflows/tt-train-post-commit.yaml
@@ -59,8 +59,6 @@ jobs:
run: |
echo "TT_METAL_HOME=$(pwd)" >> $GITHUB_ENV
- uses: ./.github/actions/prepare-metal-run
- with:
- arch: ${{ inputs.arch }}
- name: ${{ matrix.test-group.name }} tests
timeout-minutes: ${{ inputs.timeout }}
run: |
diff --git a/.github/workflows/ttnn-post-commit-wrapper.yaml b/.github/workflows/ttnn-post-commit-wrapper.yaml
index 0f6f1f4a56f..324f6582f5d 100644
--- a/.github/workflows/ttnn-post-commit-wrapper.yaml
+++ b/.github/workflows/ttnn-post-commit-wrapper.yaml
@@ -18,11 +18,9 @@ jobs:
# Since pre-compiled builds only run on 20.04, we can only test on 20.04 for now
# The full 22.04 flow can be tested without precompiled
os: [ubuntu-20.04]
- arch: [grayskull, wormhole_b0]
uses: ./.github/workflows/_build-wheels-impl.yaml
with:
os: ${{ matrix.os }}
- arch: ${{ matrix.arch }}
from-precompiled: true
secrets: inherit
ttnn-unit-tests:
diff --git a/.github/workflows/ttnn-post-commit.yaml b/.github/workflows/ttnn-post-commit.yaml
index 3f4a7601bfb..15642748dcc 100644
--- a/.github/workflows/ttnn-post-commit.yaml
+++ b/.github/workflows/ttnn-post-commit.yaml
@@ -79,7 +79,7 @@ jobs:
- uses: tenstorrent/tt-metal/.github/actions/checkout-with-submodule-lfs@main
- uses: actions/download-artifact@v4
with:
- name: eager-dist-${{ matrix.os }}-${{ inputs.arch }}
+ name: eager-dist-${{ matrix.os }}-any
- name: Set ttnn fast runtime if exists in config
if: ${{ matrix.test-group.fast_runtime_mode_off }}
run: |
@@ -90,6 +90,8 @@ jobs:
with:
docker_username: ${{ github.actor }}
docker_password: ${{ secrets.GITHUB_TOKEN }}
+ docker_opts: |
+ -e ARCH_NAME=${{ inputs.arch }}
run_args: |
WHEEL_FILENAME=$(ls -1 *.whl)
pip3 install --user $WHEEL_FILENAME
diff --git a/.github/workflows/ttnn-run-sweeps.yaml b/.github/workflows/ttnn-run-sweeps.yaml
index 70354311c6c..8b511dee0a3 100644
--- a/.github/workflows/ttnn-run-sweeps.yaml
+++ b/.github/workflows/ttnn-run-sweeps.yaml
@@ -47,6 +47,7 @@ on:
- eltwise.unary.sin.sin
- eltwise.unary.sin.sin_pytorch2
- eltwise.unary.sin.sin_forge
+ - eltwise.unary.sin.sin_sharded
- eltwise.unary.tril.tril_pytorch2
- eltwise.unary.clamp.clamp
- eltwise.unary.clamp.clamp_forge
@@ -74,8 +75,11 @@ on:
- eltwise.unary.clone.clone
- eltwise.unary.elu.elu
- eltwise.unary.elu.elu_pytorch2
+ - eltwise.unary.elu.elu_sharded
- eltwise.unary.erfc.erfc
- eltwise.unary.erfc.erfc_sharded
+ - eltwise.unary.eqz.eqz
+ - eltwise.unary.eqz.eqz_sharded
- eltwise.unary.exp.exp
- eltwise.unary.exp.exp_sharded
- eltwise.unary.exp.exp_forge
@@ -133,9 +137,12 @@ on:
- eltwise.unary.neg.neg_pytorch2
- eltwise.unary.neg.neg_forge
- eltwise.unary.erf.erf
+ - eltwise.unary.erf.erf_sharded
- eltwise.unary.erfinv.erfinv
- eltwise.unary.erfinv.erfinv_sharded
- eltwise.unary.i0.i0
+ - eltwise.unary.reciprocal.reciprocal
+ - eltwise.unary.reciprocal.reciprocal_sharded
- eltwise.unary.silu.silu
- eltwise.unary.silu.silu_pytorch2
- eltwise.unary.glu.glu
@@ -191,9 +198,11 @@ on:
- eltwise.unary_backward.hardshrink_bw
- eltwise.unary_backward.softshrink_bw
- eltwise.unary_backward.acos_bw.acos_bw
+ - eltwise.unary_backward.acos_bw.acos_bw_sharded
- eltwise.unary_backward.acosh_bw.acosh_bw
- eltwise.unary_backward.atan_bw.atan_bw
- eltwise.unary_backward.cos_bw.cos_bw
+ - eltwise.unary_backward.cos_bw.cos_bw_sharded
- eltwise.unary_backward.frac_bw.frac_bw
- eltwise.unary_backward.i0_bw.i0_bw
- eltwise.unary_backward.rad2deg_bw.rad2deg_bw
@@ -226,9 +235,12 @@ on:
- eltwise.unary_backward.tanh_bw.tanh_bw
- eltwise.unary_backward.sqrt_bw.sqrt_bw
- eltwise.unary_backward.add_bw.add_bw
+ - eltwise.unary_backward.add_bw.add_bw_sharded
- eltwise.unary_backward.assign_bw.assign_bw
- eltwise.unary_backward.fill_bw.fill_bw
+ - eltwise.unary_backward.fill_bw.fill_bw_sharded
- eltwise.unary_backward.hardsigmoid_bw.hardsigmoid_bw
+ - eltwise.unary_backward.hardsigmoid_bw.hardsigmoid_bw_sharded
- eltwise.unary_backward.lgamma_bw.lgamma_bw
- eltwise.unary_backward.multigammaln_bw.multigammaln_bw
- eltwise.unary_backward.leaky_relu_bw.leaky_relu_bw
@@ -251,6 +263,8 @@ on:
- eltwise.binary_complex.add_bw.add_bw
- eltwise.binary_complex.sub_bw.sub_bw
- eltwise.binary_complex.mul_bw.mul_bw
+ - eltwise.unary.digamma.digamma
+ - eltwise.unary.digamma.digamma_sharded
- eltwise.unary.lgamma.lgamma
- eltwise.unary.lgamma.lgamma_sharded
- eltwise.unary.logit.logit
@@ -278,12 +292,14 @@ on:
- eltwise.unary.ltz.ltz
- eltwise.unary.gez.gez
- eltwise.unary.lez.lez
+ - eltwise.unary.lez.lez_sharded
- eltwise.unary.nez.nez
- eltwise.unary.prelu.prelu
- eltwise.unary.prelu.prelu_sharded
- eltwise.unary.hardswish.hardswish_pytorch2
- eltwise.unary.hardtanh.hardtanh_pytorch2
- eltwise.unary.leaky_relu.leaky_relu
+ - eltwise.unary.leaky_relu.leaky_relu_sharded
- eltwise.unary.reglu.reglu
- eltwise.unary.round.round_sharded
- eltwise.unary_complex.polar.polar
@@ -494,8 +510,6 @@ jobs:
run: |
echo "TT_METAL_HOME=$(pwd)" >> $GITHUB_ENV
- uses: ./.github/actions/prepare-metal-run
- with:
- arch: wormhole_b0
- name: Run ttnn sweeps generation (single sweep)
if: ${{ github.event_name == 'workflow_dispatch' && github.event.inputs.sweep_name != 'ALL SWEEPS (Nightly)' }}
run: |
@@ -554,8 +568,6 @@ jobs:
run: |
echo "TT_METAL_HOME=$(pwd)" >> $GITHUB_ENV
- uses: ./.github/actions/prepare-metal-run
- with:
- arch: ${{ matrix.test-group.arch }}
- name: Run ttnn sweeps (single sweep)
if: ${{ github.event_name == 'workflow_dispatch' && github.event.inputs.sweep_name != 'ALL SWEEPS (Nightly)' }}
run: |
diff --git a/Doxyfile b/Doxyfile
index 0714e2f5fff..eaeb342c93c 100644
--- a/Doxyfile
+++ b/Doxyfile
@@ -922,9 +922,9 @@ WARN_LOGFILE =
# spaces. See also FILE_PATTERNS and EXTENSION_MAPPING
# Note: If this tag is empty the current directory is searched.
-INPUT = tt_metal/hw/inc/dataflow_api.h \
+INPUT = tt_metal/api/tt-metalium/dataflow_api.h \
tt_metal/hw/inc/ethernet/dataflow_api.h \
- tt_metal/host_api.hpp \
+ tt_metal/api/tt-metalium/host_api.hpp \
tt_metal/include/compute_kernel_api/eltwise_unary/erf_erfc.h \
tt_metal/include/compute_kernel_api/eltwise_unary/erfinv.h \
tt_metal/include/compute_kernel_api/eltwise_unary/exp.h \
@@ -953,12 +953,12 @@ INPUT = tt_metal/hw/inc/dataflow_api.h \
tt_metal/include/compute_kernel_api.h \
tt_metal/impl/kernels/kernel_args.hpp \
tt_metal/include/tt_metal/metal.hpp \
- tt_metal/include/tt_metal/types.hpp \
+ tt_metal/api/tt-metalium/types.hpp \
tt_metal/include/tt_metal/buffer.hpp \
tt_metal/include/tt_metal/command_queue.hpp \
tt_metal/include/tt_metal/device.hpp \
tt_metal/include/tt_metal/event.hpp \
- tt_metal/include/tt_metal/global_circular_buffer.hpp \
+ tt_metal/api/tt-metalium/global_circular_buffer.hpp \
tt_metal/include/tt_metal/kernel.hpp \
tt_metal/include/tt_metal/program.hpp \
tt_metal/include/tt_metal/trace.hpp
diff --git a/build_metal.sh b/build_metal.sh
index 7ff69bc1c2d..3e47dd263a3 100755
--- a/build_metal.sh
+++ b/build_metal.sh
@@ -240,7 +240,6 @@ fi
if [ "$build_tests" = "ON" ]; then
cmake_args+=("-DTT_METAL_BUILD_TESTS=ON")
cmake_args+=("-DTTNN_BUILD_TESTS=ON")
- cmake_args+=("-DTT_UMD_BUILD_TESTS=ON")
fi
if [ "$build_metal_tests" = "ON" ]; then
@@ -276,7 +275,6 @@ fi
if [ "$build_all" = "ON" ]; then
cmake_args+=("-DTT_METAL_BUILD_TESTS=ON")
cmake_args+=("-DTTNN_BUILD_TESTS=ON")
- cmake_args+=("-DTT_UMD_BUILD_TESTS=ON")
cmake_args+=("-DBUILD_PROGRAMMING_EXAMPLES=ON")
cmake_args+=("-DBUILD_TT_TRAIN=ON")
fi
diff --git a/cmake/helper_functions.cmake b/cmake/helper_functions.cmake
index ca0cdbbbcee..60bc56372b6 100644
--- a/cmake/helper_functions.cmake
+++ b/cmake/helper_functions.cmake
@@ -53,14 +53,7 @@ function(CREATE_PGM_EXAMPLES_EXE TESTLIST SUBDIR)
m
pthread
)
- target_include_directories(
- ${TEST_TARGET}
- PRIVATE
- ${PROJECT_SOURCE_DIR}
- ${PROJECT_SOURCE_DIR}/tt_metal
- ${PROJECT_SOURCE_DIR}/tt_metal/common
- ${CMAKE_CURRENT_SOURCE_DIR}
- )
+ target_include_directories(${TEST_TARGET} PRIVATE ${CMAKE_CURRENT_SOURCE_DIR})
set_target_properties(
${TEST_TARGET}
PROPERTIES
diff --git a/dependencies/CMakeLists.txt b/dependencies/CMakeLists.txt
index 3064d846fe5..f7f9d6c065f 100644
--- a/dependencies/CMakeLists.txt
+++ b/dependencies/CMakeLists.txt
@@ -111,3 +111,24 @@ CPMAddPackage(
OPTIONS
"XTENSOR_ENABLE_TESTS OFF"
)
+
+############################################################################################################################
+# benchmark : https://github.com/google/benchmark
+############################################################################################################################
+
+CPMAddPackage(NAME benchmark GITHUB_REPOSITORY google/benchmark GIT_TAG v1.9.1)
+
+if(benchmark_ADDED)
+ set_target_properties(
+ benchmark
+ PROPERTIES
+ LIBRARY_OUTPUT_DIRECTORY
+ "${CMAKE_BINARY_DIR}/lib"
+ )
+endif()
+
+# TODO(afuller): Move this to CPM and use upstream's CMake file, AFTER we move to Ubuntu 22.04 and drop 20.04 and bump
+# our minimum CMake version accordingly. Taskflow's CMake wants v3.18+
+add_library(Taskflow INTERFACE)
+add_library(Taskflow::Taskflow ALIAS Taskflow)
+target_include_directories(Taskflow SYSTEM INTERFACE ${PROJECT_SOURCE_DIR}/tt_metal/third_party/taskflow)
diff --git a/models/utility_functions.py b/models/utility_functions.py
index f13fd48d8ca..88dada95370 100644
--- a/models/utility_functions.py
+++ b/models/utility_functions.py
@@ -61,6 +61,27 @@ def torch_random(shape, low, high, dtype):
return torch.zeros(shape, dtype=dtype).uniform_(low, high)
+def torch_random_with_zeros(shape, low, high, dtype, zero_fraction=0.1):
+ total_elements = torch.prod(torch.tensor(shape)).item()
+ num_zeros = int(total_elements * zero_fraction)
+ num_random = total_elements - num_zeros
+
+ # Generate random values between low and high
+ random_values = torch.empty(num_random).uniform_(low, high)
+ zeros = torch.zeros(num_zeros)
+
+ # Combine zeros and random values
+ combined = torch.cat([zeros, random_values])
+
+ # Shuffle the tensor
+ shuffled = combined[torch.randperm(combined.size(0))]
+
+ # Reshape to the desired shape
+ result_tensor = shuffled.view(shape)
+ result_tensor = result_tensor.to(dtype)
+ return result_tensor
+
+
### Profiling ###
class Profiler:
def __init__(self):
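
For context on the helper added above, a hypothetical usage sketch follows (not part of this diff); the shape, bounds, and zero fraction are illustrative only.

```python
import torch

from models.utility_functions import torch_random_with_zeros

# Build a small activation-like tensor in which roughly 10% of entries are exactly
# zero, e.g. for tests that need to exercise zero-handling code paths.
x = torch_random_with_zeros((1, 1, 32, 32), low=-1.0, high=1.0, dtype=torch.bfloat16, zero_fraction=0.1)

assert x.shape == (1, 1, 32, 32)
assert (x == 0).sum().item() >= int(0.1 * x.numel())  # at least the requested number of zeros
```
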
diff --git a/setup.py b/setup.py
index 6cd62f16410..b8ab90ff22f 100644
--- a/setup.py
+++ b/setup.py
@@ -35,7 +35,7 @@ def get_is_srcdir_build():
def get_arch_name():
- return attempt_get_env_var("ARCH_NAME")
+ return "any"
def get_metal_local_version_scheme(metal_build_config, version):
diff --git a/tech_reports/LLMs/llms.md b/tech_reports/LLMs/llms.md
index 3cd4da7eba5..1ae7f25d0b6 100644
--- a/tech_reports/LLMs/llms.md
+++ b/tech_reports/LLMs/llms.md
@@ -77,7 +77,7 @@ k_heads = ttnn.experimental.rotary_embedding_llama(
)
```
-#### Setting up inputs to RoPE
+#### 2.2.1 Setting up inputs to RoPE
The fused operation uses a different parallelization scheme internally, depending on whether the model is in *prefill* or *decode* mode. The following table describes the shapes and memory configurations for *prefill* and *decode* modes:
@@ -90,7 +90,7 @@ Fused operation uses a different parallelization scheme internally depending on
*Note: (TH, TW) = (TILE_HEIGHT, TILE_WIDTH)*
-#### Decode mode specifics
+#### 2.2.2 Decode mode specifics
The cos/sin matrices are generated in two slightly different ways, depending on the mode of operation. For *prefill* mode, the cos/sin matrices are computed once at initialization using the *prefill* sequence length, and then passed into the RoPE OP. However, in *decode* mode, since the position index of each user is updated from token to token, the cos/sin matrices must be updated across iterations. Here, we leverage our `TtLlamaRotarySetup` module, which can be used at each decode iteration to get the corresponding cos/sin matrices.
The following code sample shows how `TtLlamaRotarySetup` can be used in decode mode:
@@ -139,7 +139,7 @@ out = ttnn.experimental.rotary_embedding_llama(
Normalization is a critical operation in Large Language Models (LLMs), ensuring stable training and efficient inference. Two widely adopted normalization techniques in modern LLMs, **LayerNorm** and **RMSNorm**, are fully supported in TT-NN.
-#### Implementations of Normalization Operations
+#### 2.3.1 Implementations of Normalization Operations
TT-NN includes two primary implementations of normalization operations to handle diverse activation layouts efficiently:
@@ -147,7 +147,7 @@ TT-NN includes two primary implementations of normalization operations to handle
2. **Distributed Norm**
-#### 1. Non-Distributed Norm
+#### 2.3.1.1 Non-Distributed Norm
**Non-Distributed Norm** refers to the standard implementation of normalization operations applied to activations that are not distributed across multiple devices. This type of normalization is suitable for setups where the entire activation or embedding is available locally on a single device or is replicated identically across multiple devices in a data-parallel setup. This implementation supports both sharded and interleaved inputs.
@@ -205,14 +205,11 @@ ttnn_gamma_rm = ttnn.as_tensor(
)
```
-
-
-
-#### 2. Distributed Norm
+#### 2.3.1.2 Distributed Norm
The distributed implementation is designed for cases where activations are **sharded along the embedding dimension** across multiple devices. It ensures the correct computation of mean and variance across shards by leveraging cross-device communication. Both interleaved and width-sharded inputs are supported.
-#### Steps to Perform Distributed Normalization on TT-Devices
+##### 2.3.1.2.1 Steps to Perform Distributed Normalization on TT-Devices
1. **Compute Local Statistics** - Each device computes the required statistics (e.g., \(E[x]\), \(E[x^2]\)) locally on its shard of the input tensor.
- For **RMSNorm**, only \(E[x^2]\) is required.
@@ -258,7 +255,6 @@ The distributed implementation is designed for cases where activations are **sha
```
- **Output**: A tensor of shape `[1, 1, batch, embedding_dim // num_devices]`.
-
> [!NOTE]
> The following inputs are valid for both implementations.
> - **Interleaved Inputs**:
@@ -268,8 +264,7 @@ The distributed implementation is designed for cases where activations are **sha
For width-sharded inputs, the kernel splits the work across the embedding dimension.
This design is more **optimal for decode cases**, where the sequence length is typically `seq_len=1`.
-
-#### References
+#### 2.3.1.3 References
- Non-Distributed Norm Op Code [[1]](https://github.com/tenstorrent/tt-metal/tree/main/ttnn/cpp/ttnn/operations/normalization/layernorm) [[2]](https://github.com/tenstorrent/tt-metal/tree/main/ttnn/cpp/ttnn/operations/normalization/rmsnorm)
- Distributed Norm Op Code [[3]](https://github.com/tenstorrent/tt-metal/tree/main/ttnn/cpp/ttnn/operations/normalization/layernorm_distributed) [[4]](https://github.com/tenstorrent/tt-metal/tree/main/ttnn/cpp/ttnn/operations/normalization/rmsnorm_distributed)
- Non-Distributed Norms Unit Tests [[5]](https://github.com/tenstorrent/tt-metal/blob/main/tests/tt_eager/python_api_testing/unit_testing/misc/test_layernorm_sharded.py) [[6]](https://github.com/tenstorrent/tt-metal/blob/main/tests/tt_eager/python_api_testing/unit_testing/misc/test_layernorm.py)
@@ -345,7 +340,7 @@ Common Terminology:
| bsz | Batch Size |
| batch_id | Batch Index (used for prefill) |
| cur_pos/cur_pos_tensor | List/tensor of current positions in the sequence for each batch. |
-| cache_len | Length of the KV Cache |
+| cache_len | Length of the KV Cache. |
| seqlen | Sequence Length |
| dim | Hidden dimension of input x. |
| head_dim | Hidden dimension of Q, K, V. |
@@ -430,38 +425,39 @@ The attention module in decode mode expects input shape `(1, seqlen=1, bsz, hidd
An end-to-end example of the decode attention module is in the `models/demos/llama3/tt/llama_attention.py` file, under the `forward_decode` method. The decode mode is broken down into the following steps:
-1. QKV projections matmuls.
+1. **QKV Projections Matmuls**
- This works the same as in prefill mode, using `ttnn.linear`. Note that the input shape is `(1, 1, bsz, dim)` instead of `(1, 1, seqlen, dim)`.
- Input/Output shapes:
```python
(1, 1, bsz, dim) -> (1, 1, bsz, (n_q_heads+2*n_kv_heads)*head_dim)
```
-2. Reshape Q, K, V to match the expected input shape for scaled dot product attention.
+2. **Reshape QKV**
+ - Reshape Q, K, and V to match the expected input shape for scaled dot product attention.
- We split the fused QKV tensor into individual Q, K, V tensors using `ttnn.experimental.nlp_create_qkv_heads_decode`.
> [!NOTE]
> This is a different OP than `ttnn.experimental.nlp_create_qkv_heads` used in prefill mode. For example:
->
- ```python
- Q, K, V = ttnn.experimental.nlp_create_qkv_heads_decode(
- xqkv_fused,
- num_heads=n_q_heads,
- num_kv_heads=n_kv_heads,
- memory_config=ttnn.MemoryConfig(
- ttnn.TensorMemoryLayout.HEIGHT_SHARDED, ttnn.BufferType.L1
- )
- )
- ```
+> ```python
+> Q, K, V = ttnn.experimental.nlp_create_qkv_heads_decode(
+> xqkv_fused,
+> num_heads=n_q_heads,
+> num_kv_heads=n_kv_heads,
+> memory_config=ttnn.MemoryConfig(
+> ttnn.TensorMemoryLayout.HEIGHT_SHARDED, ttnn.BufferType.L1
+> )
+> )
+> ```
+
- **Input/Output Shapes**: The output is height sharded across the batch dimension on `bsz` number of cores.
```python
(1, 1, bsz, (n_q_heads+2*n_kv_heads)*head_dim) -> (1, bsz, n_q_heads, head_dim), (1, bsz, n_kv_heads, head_dim), (1, bsz, n_kv_heads, head_dim)
```
-3. Apply RoPE to Q and K
- - Again, apply the RoPE transformation to Q and K using the rotary embedding op outlined in [2.2 RoPE](#22-rope). The input/output shapes remain the same as in step 2.
+3. **Apply RoPE to Q and K**
+ - Again, apply the RoPE transformation to Q and K using the rotary embedding OP outlined in [2.2 RoPE](#22-rope). The input/output shapes remain the same as in step 2.
-4. Cache K and V
- - Populate the KV cache at `cur_pos` for all batches with the current K and V tensors using the `ttnn.experimental.paged_update_cache` op. This OP takes in an optional `page_table` argument to support paged KV cache updates. Example:
+4. **Cache K and V**
+ - Populate the KV cache at `cur_pos` for all batches with the current K and V tensors using the `ttnn.experimental.paged_update_cache` OP. This OP takes in an optional `page_table` argument to support paged KV cache updates. Example:
```python
ttnn.experimental.paged_update_cache(keys, K, update_idxs=cur_pos, page_table=page_table)
ttnn.experimental.paged_update_cache(values, V, update_idxs=cur_pos, page_table=page_table)
@@ -471,13 +467,13 @@ An end-to-end example of the decode attention module is in the `models/demos/lla
ttnn.experimental.paged_update_cache(keys, K, update_idxs_tensor=cur_pos_tensor, page_table=page_table)
```
-5. Scaled Dot Product Attention Decode
+5. **Scaled Dot Product Attention Decode**
- Perform scaled dot product attention using custom flash attention kernel optimized for decode mode, `ttnn.transformer.scaled_dot_product_attention_decode` and `ttnn.transformer.paged_scaled_dot_product_attention_decode` for paged KV cache.
- `ttnn.transformer.scaled_dot_product_attention_decode` considers the following arguments:
- `q`: Query tensor of shape `(1, bsz, n_q_heads, head_dim)`.
- `k`: Key tensor of shape `(1, bsz, cache_len, head_dim)`.
- `v`: Value tensor of shape `(1, bsz, cache_len, head_dim)`.
- - `is_causal`: bool, defaults to `true`. Whether to apply causal masking.
+ - `is_causal`: Bool, defaults to `true`. Whether to apply causal masking.
- `attn_mask`: Optional attention mask tensor. Defaults to `None` and only used if `is_causal=False`.
- `cur_pos`: (Required for is_causal=True) List of current positions in the sequence for each batch. Defaults to `None`. Must be provided if `cur_pos_tensor` is not provided.
- `cur_pos_tensor`: (Required for is_causal=True) Optional current position tensor. Defaults to `None`. Must be provided if `cur_pos` is not provided.
@@ -495,7 +491,7 @@ An end-to-end example of the decode attention module is in the `models/demos/lla
attn_output = ttnn.transformer.paged_scaled_dot_product_attention_decode(Q, K, V, attn_mask=mask, is_causal=False)
```
-6. Output Reshape and Output Matmul
+6. **Output Reshape and Output Matmul**
- Finally, use `ttnn.experimental.nlp_concat_heads_decode` to reshape the output of the attention OP, followed by a standard `ttnn.linear` to do the output projection. For example:
```python
attn_output = ttnn.experimental.nlp_concat_heads_decode(attn_output, num_heads=n_q_heads)
@@ -511,8 +507,7 @@ Flash attention and flash decode are the major OPs for attention. They are optim
Here are some useful details regarding attention OPs for efficient and bug-free code writing:
-1. **Program Configs** in flash attention (and flash decode) OPs:
- The Program config has the following parameters:
+1. Program Configs in flash attention (and flash decode) OPs. The program config has the following parameters:
- `compute_with_storage_grid_size`: The grid size.
- `q_chunk_size`: The size of a chunk to process at a time for Q.
- `k_chunk_size`: The size of a chunk to process at a time for K and V.
@@ -525,9 +520,9 @@ Flash decode processes the entire Q (since query in decode mode is small) and K/
Finally, the `exp_approx_mode` field is to set the exponential approximation mode for softmax in flash attention and flash decode. We recommend setting this to `true` for small `seqlen/chunk_size` values. For large `seqlen/chunk_size` values, the error introduced by the exponential approximation can accumulate through chunk accumulation, causing major degradation in pcc. For example in Llama3 models, we use `q_chunk_size` and `k_chunk_size` of 512, and `exp_approx_mode` set to `false` for long sequence lengths greater than 16K.
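A rough sketch of what such a program config can look like is shown below; the field names follow the parameters listed above, while the grid size and chunk values are illustrative assumptions rather than values taken from this guide:

```python
# Hedged sketch: flash attention program config for long-sequence prefill
sdpa_program_config = ttnn.SDPAProgramConfig(
    compute_with_storage_grid_size=(8, 8),  # assumed grid; typically the device's compute grid
    q_chunk_size=512,
    k_chunk_size=512,
    exp_approx_mode=False,  # per the note above, avoid the approximation for very long sequences
)
```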
-2. **Current Position Tensor** for flash decode and kv cache OPs:
+2. Current Position Tensor for flash decode and KV cache OPs:
-In decode mode provide a list of current positions or a tensor. The tensor version can be more efficient because it supports **tracing**. For more information about tracing, see: [4.1 Tracing](#41-tracing). Tracing requires the traced variables to be statically known at the compile time. If you provide a list of current positions, you cannot modify it for the next token generation. However, if you provide a tensor, the position values are stored in device memory and can be updated using binary addition op, e.g. `ttnn.add`.
+In decode mode, provide either a list of current positions or a tensor. The tensor version can be more efficient because it supports **tracing**. For more information about tracing, see: [4.1 Tracing](#41-tracing). Tracing requires the traced variables to be statically known at compile time. If you provide a list of current positions, you cannot modify it for the next token generation. However, if you provide a tensor, the position values are stored in device memory and can be updated using a binary addition OP, e.g. `ttnn.add`.
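For example, a minimal sketch of advancing the position tensor on device between decode iterations (the tensor name is hypothetical):

```python
# Advance every user's position by one token using a binary addition OP
cur_pos_tensor = ttnn.add(cur_pos_tensor, 1)
```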
### 2.5 MLP
@@ -544,7 +539,7 @@ y = FF2(w2_in)
Let's dive into our implementation of MLP, and discuss what makes it performant across different WH systems.
-#### 1. Setup
+#### 2.5.1 Setup
When used in the model by the `TtLlamaDecoder` module class, the MLP class is initialized at the start, where the weights for `w1`, `w2`, and `w3` are loaded and fractured across devices in specific schemes, as outlined in the [Multi-Device](#33-multi-device) section. Specifically, in n300 and T3000 systems the weights are 1D column fractured, and in TG systems the weights are 2D fractured.
```py
@@ -559,7 +554,7 @@ self.feed_forward = TtLlamaMLP(
)
```
-#### 2. Inputs
+#### 2.5.2 Inputs
At runtime, the `forward` function of `TtLlamaMLP` is called with either *'prefill'* or *'decode'* mode, with inputs replicated across devices, for all WH system configurations.
> [!NOTE]
> In the actual model, the input `ff_in` is the output of the `norm` step prior to MLP.
@@ -597,9 +592,7 @@ ff_in_memory_config = ttnn.DRAM_MEMORY_CONFIG
> ff_in = ttnn.reshape(ff_in, [1, seq_len // 1024, 1024, -1])
> ```
-
-
-#### 2. Setting Up Program Configurations For Matmuls
+#### 2.5.3 Setting Up Program Configurations For Matmuls
Depending on the mode of operation, the `forward` function of `TtLlamaMLP` instantiates different program configs for matmuls of FF1/FF3, and FF2.
**Decode mode**
@@ -661,7 +654,6 @@ def matmul_config(
fuse_batch=fuse_batch,
)
-
_, _, m, k = ff_in.shape
n = hidden_dim // num_devices
pc1 = matmul_config(
@@ -674,8 +666,7 @@ pc1 = matmul_config(
)
```
-
-#### 3. FF1/FF3 Matmul
+#### 2.5.4 FF1/FF3 Matmul
The first set of operations in the MLP are:
```py
w1_out = FF1(x)
@@ -712,7 +703,7 @@ w3_out = ttnn.linear(
)
```
-#### 3.1 FF1/FF3 Matmul With 2D Weight Fracturing
+#### 2.5.5 FF1/FF3 Matmul With 2D Weight Fracturing
In the case of TG systems, where we have access to a 2D device mesh, we can leverage 2D weight fracturing. For a weight tensor with shape `[1, 1, K, N]`, using 2D weight fracturing on a `(8, 4)` device mesh, the resulting shape on each device would be: `[1, 1, K / 4, N / 8]`. In other words, the inner dimension (K) of the matmul is spread out across four devices, and to complete the entire matmul operation, a reduction step across the partials is necessary. We do this using an all-reduce operation along the four devices in `cluster_axis=1` of the device mesh.
```py
@@ -734,7 +725,7 @@ In the case of TG systems, where we have access to a 2D device mesh, we can leve
)
```
-#### 4. Multiply + Fused SiLU Activation
+#### 2.5.6 Multiply + Fused SiLU Activation
The output of the FF1/FF3 matmuls are column fractured tensors (the extra all-reduce operation for TG systems ensures this). The next operation is:
```py
@@ -757,7 +748,7 @@ w2_in = ttnn.multiply(
Following our pattern mentioned before, the outputs are L1 sharded in `decode` mode and DRAM interleaved in `prefill` mode.
-#### 5. FF2 Matmul
+#### 2.5.7 FF2 Matmul
The last computation in MLP is:
```py
y = FF2(w2_in)
@@ -784,7 +775,7 @@ if seq_len >= 1024: # Reshape back to intended shape
w2_out = ttnn.reshape(w2_out, [1, 1, seq_len, -1])
```
-###### 5.1 Accumulating the partial outputs of FF2
+###### 2.5.7.1 Accumulating the partial outputs of FF2
Since the output of FF2 is the correct shape but only a partial on each device, the output of the MLP module is required to be fractured where each device has fully accumulated the inner dim of the matmul, but only has a fraction of the outer dim. There are two different ways to handle this, depending on if the WH system has a 1D or 2D device mesh.
@@ -812,14 +803,12 @@ Since the output of FF2 is the correct shape but only a partial on each device,
```
### 2.6 Decoder
-
@@ -929,7 +924,7 @@ for i, split_size in enumerate(split_sizes):
We use DRAM-sharded matmul for LMHead with `program_config` and `memory_config` generated by the code below.
For more information check [Section: Op Configs](#44-op-configs).
-The primary reason for having multiple `program_configs` is that the weight shapes may result in unequal split sizes. This variability means the same configuration cannot be used for every matrix multiplication.
+The primary reason for having multiple `program_configs` is that the weight shapes may result in unequal split sizes. This variability means the same configuration cannot be used for every matmul.
```py
# Generate dram-sharded memory_config
@@ -950,7 +945,7 @@ self.program_configs = [
Once weights are pushed to the devices and the decoders are executed, the `LMHead` forward pass needs to be executed in iterations.
The code below shows that after each iteration outputs are converted from sharded to interleaved tensors. Once all iterations are completed, the final output is produced by concatenation over the last dim and returned as `output`.
-When executing the model, it is essential to ensure that the output of the last decoder is already replicated across tensors. Since this replication is enforced earlier, no additional code is required in the `LMHead` forward pass to handle it.
+When executing the model, you must ensure that the output of the last decoder is already replicated across tensors. Since this replication is enforced earlier, no additional code is required in the `LMHead` forward pass to handle it.
```py
def forward(self, x: ttnn.Tensor):
@@ -972,19 +967,18 @@ def forward(self, x: ttnn.Tensor):
return output
```
-
### 2.8 Model
-Once the model components (discussed in previous sections) are implemented, there isn’t much left to finalize. In our implementation, embeddings are managed outside the model class, as explained in [Section 2.1 Embedding](#21-embedding).
+Once the previous model components are implemented, there isn’t much left to finalize. In our implementation, embeddings are managed outside the model class, as explained in [Section 2.1 Embedding](#21-embedding).
-The model’s constructor initializes N decoders (e.g. 80 for Llama3.1-70b), the `RMSNorm` and the `LMHead`, ensuring that weights for all components are loaded onto the appropriate devices.
+The model’s constructor initializes N decoders (for example, 80 for Llama3.1-70b), the `RMSNorm`, and the `LMHead`, ensuring that weights for all components are loaded onto the appropriate devices.
-During the forward pass, the decoders are executed sequentially, followed by normalization and the `LMHead` computation at the end.
-A specific optimization is applied for the prefill mode: since only the last token is relevant, the `LMHead` is executed only on the final tile in this mode.
+During the forward pass, decoders are executed sequentially, followed by normalization and `LMHead` computation at the end.
+A specific optimization is applied in prefill mode: since only the last token is relevant, the `LMHead` is executed only on the final tile.
In prefill mode, the RMSNorm output is interleaved, but the LMHead requires a sharded tensor. To accommodate this, the `interleaved_to_sharded` function is used to prepare the output accordingly.
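A minimal sketch of that conversion is shown below; the memory config handle is hypothetical and would be built as described in [Section: Op Configs](#44-op-configs):

```python
# Convert the interleaved norm output into the width-sharded layout the LMHead expects
x = ttnn.interleaved_to_sharded(x, lm_head_input_memory_config)
```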
@@ -1026,31 +1020,31 @@ def forward(
## 3. Features
### 3.1 Generative Decoding
-Almost every LLM generates text in the same manner: Given a prompt from the user, the LLM predicts the next token. Then, the LLM takes that new token and uses it as context to predict the following token. This process repeats until the LLM generates a token that indicates the end of the sequence, or until the user decides to stop the generation. The process is called "autoregressive generation" because each new token is used to predict the next token.
+Almost every LLM generates text in the same manner: given a prompt from the user, the LLM predicts the next token. Then, the LLM takes that new token and uses it as context to predict the following token. This process repeats until the LLM generates a token that indicates the end of the sequence, or until the user decides to stop the generation. The process is called "autoregressive generation" because each new token is used to predict the next token.
-#### Model Inputs and Outputs
+#### 3.1.1 Model Inputs and Outputs
Inputs to the model for generative decoding are generally:
-- tokens: produced by the tokenizer
-- position ids: the position of the tokens in the sequence
-- KV cache: an inference optimization that caches intermediate values
+- **Tokens:** Produced by the tokenizer.
+- **Position IDs:** Position of the tokens in the sequence.
+- **KV Cache:** Inference optimization that caches intermediate values.
-In the model, tokens are embedded from the vocabulary space to the embedding space. Position ids are necessary for updating the KV cache and for positional embeddings like RoPE.
+In the model, tokens are embedded from the vocabulary space to the embedding space. Position IDs are necessary for updating the KV cache and for positional embeddings like RoPE.
The model outputs:
-- logits for the next token
-- an updated KV cache
+- Logits for the next token.
+- Updated KV cache.
-The logits are unnormalized probabilities over the vocabulary. Given these probabilities, the sampler must decide which of these tokens in the vocabulary will be chosen. There are a few sampling methods that are commonly used to pick the next token:
-- Greedy decoding (argmax of the logits, picks the most likely next token)
-- Top-p/top-k sampling (restricts the logits according to p and k values, then samples according to the remaining probabilities)
+The logits are unnormalized probabilities over the vocabulary. Given these probabilities, the sampler must decide which token in the vocabulary is chosen next. There are a few sampling methods that are commonly used to pick the next token:
+- **Greedy Decoding:** Argmax of the logits, picks the most likely next token.
+- **Top-p/top-k Sampling:** Restricts the logits according to p and k values, then samples according to the remaining probabilities.
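A host-side sketch of these two samplers in PyTorch is shown below; the vocabulary size and `k` value are hypothetical:

```python
import torch

logits = torch.randn(128256)  # hypothetical logits over a Llama3-sized vocabulary

# Greedy decoding: pick the most likely next token
next_token = torch.argmax(logits, dim=-1)

# Top-k sampling: keep the k most likely tokens, renormalize, then sample
k = 40
topk_vals, topk_idx = torch.topk(logits, k)
probs = torch.softmax(topk_vals, dim=-1)
next_token = topk_idx[torch.multinomial(probs, num_samples=1)]
```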
-#### KV cache
-The KV cache is an inference optimization. It allows us to cache some intermediate values during the first inference step which are reused in later steps.
+#### 3.1.2 KV Cache
+The KV cache is an inference optimization. It allows us to cache intermediate values during the first inference step for reuse in later steps.
On the first inference step, the model processes the full prompt and caches the K and V projections for each layer. Subsequent inference steps compute a Q, K, V projection only for the new token, then use the cached K and V projections in attention. Therefore the first step (prefill) creates the KV cache and subsequent steps (decode) use and update the cache.
-The size of the KV cache depends on the batch size and sequence length. Since accelerators have finite memory, it can be necessary to tradeoff batch size and sequence length to allow the KV cache to fit in memory.
+The size of the KV cache depends on the batch size and sequence length. Since accelerators have finite memory, it is often necessary to trade off batch size and sequence length to allow the KV cache to fit in memory.
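As a back-of-the-envelope sketch, with all values hypothetical (roughly Llama3-70b-like), the cache size can be estimated as:

```python
# Two cached tensors (K and V) per layer, per user, per cached position
n_layers, batch_size, max_seq_len = 80, 32, 8192
n_kv_heads, head_dim, bytes_per_datum = 8, 128, 2  # GQA KV heads, 2-byte datums

kv_cache_bytes = 2 * n_layers * batch_size * max_seq_len * n_kv_heads * head_dim * bytes_per_datum
print(kv_cache_bytes / 1e9)  # ~85.9 GB, which is why batch size and sequence length are traded off
```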
-#### Batching
+#### 3.1.3 Batching
LLMs use batching to process multiple sequences in parallel. There are a few reasons why batching is useful:
- Real-world LLM services need to handle multiple concurrent requests.
- LLM inference is bound by time to read model weights from DRAM. Batching allows model weight reuse across multiple sequences.
@@ -1060,15 +1054,15 @@ However, there are tradeoffs with batching. In decode mode, latency scales subli
It is typical to use different batch sizes for different use cases, depending on the goal of the system.
-#### Performance Metrics
-**Time to first token (TTFT)** measures the latency to generate the first token of the sequence. This is the time to prefill a prompt and generate the first token. It is a measure of interactivity.
-
-**Total throughput (tokens per second)** tells us the total number of tokens that the model can generate per second. `total throughput = batch size / decode step latency`. Total throughput is important for cost-sensitive systems or offline processing, where interactivity is less important than throughput. Generally, increasing batch size will increase total throughput.
+#### 3.1.4 Performance Metrics
+**Time to First Token (TTFT):** Measures the latency to generate the first token of the sequence. This is the time to prefill a prompt and generate the first token. It is a measure of interactivity.
-**User throughput (tokens per second per user)** is calculated as `user throughput = 1 / decode step latency`. User throughput tells us how interactive the model is, and tells us how fast the generation is for a single user. Generally, decreasing batch size will increase user throughput.
+**Total Throughput (Tokens per Second):** The total number of tokens the model can generate per second, computed as `total throughput = batch size / decode step latency`. Total throughput is important for cost-sensitive systems or offline processing, where interactivity is less important than throughput. Generally, increasing batch size will increase total throughput.
-Note that each of these metrics change with batch size and sequence length. When reporting TTFT, total throughput, and user throughput, the batch size and sequence length must be specified.
+**User Throughput (Tokens per Second per User):** Calculated as `user throughput = 1 / decode step latency`. User throughput tells us how interactive the model is and how fast generation is for a single user. Generally, decreasing batch size will increase user throughput.
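As a quick illustration of how these metrics relate (numbers are hypothetical):

```python
batch_size = 32
decode_step_latency = 0.020  # seconds per decode step

total_throughput = batch_size / decode_step_latency  # 1600 tokens/s across all users
user_throughput = 1 / decode_step_latency            # 50 tokens/s for each individual user
```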
+> [!NOTE]
+> Each of these metrics change with batch size and sequence length. When reporting TTFT, total throughput, and user throughput, the batch size and sequence length must be specified.
### 3.2 Prefill and Decode
@@ -1078,15 +1072,15 @@ In our LLM implementations, the prefill phase is done sequentially for each user
The decode phase is parallel-computed for all users, but sequential for each token within a batch of users. Each new token can only be generated after the previous one, as the model must maintain causality in attention computations.
-#### **Technical Implementation Differences**
+#### 3.2.1 Technical Implementation Differences
-The intermediate activations in prefill mode are kept in DRAM, due to the large size of the tensors which contain the entire sequence length. In decode mode, the intermediate activations are kept in L1 memory instead, since in this mode the sequence length to compute is just 1 (one token at the time), reducing latency.
+The intermediate activations in prefill mode are kept in DRAM, due to the large size of the tensors which contain the entire sequence length. In decode mode, the intermediate activations are kept in L1 memory instead, since in this mode the sequence length to compute is just one token at a time, reducing latency.
-##### 1. Reshaping for Large Matrix Multiplications
+##### 3.2.1.1 Reshaping for Large Matrix Multiplications
Please see the [attention source code](../../models/demos/llama3/tt/llama_attention.py) for reference.
-In prefill mode, when the input sequence length is very large, the model reshapes its input tensors to process sequences in smaller chunks in parallel for larger matrix multiplications, such as `wqkv`, `wo` in the attention module, and `w1`, `w2`, `w3` in the MLP module. This reshaping prevents running out of memory in cases of long prefill sequence lengths. For instance:
+In prefill mode, when the input sequence length is very large, the model reshapes its input tensors to process sequences in smaller chunks in parallel for larger matmuls, such as `wqkv`, `wo` in the attention module, and `w1`, `w2`, `w3` in the MLP module. This reshaping prevents running out of memory in cases of long prefill sequence lengths. For example:
```python
if seq_len > 2048:
@@ -1102,9 +1096,9 @@ xqkv_fused = ttnn.linear(
)
```
-This reshaping is not needed for decode mode because it only processes one token at a time. Instead, the parallelization for decode mode is done over user batches, which currently only goes up to 32.
+Reshaping is not needed for decode mode because it only processes one token at a time. Instead, the parallelization for decode mode is done over user batches, which currently only goes up to 32.
-##### 2. KV Cache Management
+##### 3.2.1.2 KV Cache Management
The KV-cache is filled during prefill using the `ttnn.experimental.paged_fill_cache` operation. This supports page tables, which enables the hot-swapping of new users when the full model is deployed.
@@ -1130,8 +1124,8 @@ ttnn.experimental.paged_update_cache(
)
```
-##### 3. Attention Computation
-###### Prefill:
+##### 3.2.1.3 Attention Computation
+###### 3.2.1.3.1 Prefill:
```python
# Split q_heads into num_groups and kv_heads for parallel group computation for grouped query attention (GQA)
q_heads_84SD_8b = ttnn.reshape(
@@ -1149,7 +1143,7 @@ attn_output_84SD = ttnn.transformer.scaled_dot_product_attention(
)
```
-###### Decode:
+###### 3.2.1.3.2 Decode:
```python
# Decode uses cached states instead of recomputing
attn_output_11BH = ttnn.transformer.scaled_dot_product_attention_decode(
@@ -1160,27 +1154,27 @@ attn_output_11BH = ttnn.transformer.scaled_dot_product_attention_decode(
)
```
-##### 4. Slicing Before the LM Head
-At the end of prefill, the model should generate the first decoded token, then signaling the start of the decode phase. To this end, the model slices the output of the last decoder layer to the last tile before computing the LM head. This is necessary because only last token from prefill is needed to start the autoregressive decoding.
+##### 3.2.1.4 Slicing Before the LM Head
+At the end of prefill, the model should generate the first decoded token, then signal the start of the decode phase. To this end, the model slices the output of the last decoder layer to the last tile before computing the LM head. This is necessary because only the last token from prefill is needed to start the autoregressive decoding.
```python
x = ttnn.slice(x, (0, 0, get_last_token, 0), (1, 1, get_last_token + 32, x.shape[-1]))
```
-#### **Prefill vs. Decode: Comparison Summary**
+#### 3.2.2 Prefill vs. Decode: Comparison Summary
| | Prefill Mode | Decode Mode |
| --- | --- | --- |
-| Purpose | Bulk sequence processing for initialization or training | Incremental processing for autoregressive inference |
-| Demo Parallelization | Sequential for each user, parallel for the sequence length of each user | Parallel for 32 users, sequential for each token within a batch of users |
-| Batch and sequence Length | Processes long sequences (≥ 128 tokens), single user | Processes batch of users (≤ 32 users), single token |
-| Memory Use | DRAM, with reshaping into smaller chunks for long sequence lengths | L1 on-chip memory for fast, low-latency processing |
-| Attention | Handles sequences in bulk; more memory-intensive | Incremental attention with precomputed components |
-| LM head slicing | Slices to last tile before Lm head matmul to extract the last token | Slicing not required |
+| **Purpose** | Bulk sequence processing for initialization or training. | Incremental processing for autoregressive inference. |
+| **Demo Parallelization** | Sequential for each user, parallel for the sequence length of each user. | Parallel for 32 users, sequential for each token within a batch of users. |
+| **Batch and Sequence Length** | Processes long sequences (≥ 128 tokens), single user. | Processes batch of users (≤ 32 users), single token. |
+| **Memory Use** | DRAM, with reshaping into smaller chunks for long sequence lengths. | L1 on-chip memory for fast, low-latency processing. |
+| **Attention** | Handles sequences in bulk; more memory-intensive. | Incremental attention with precomputed components. |
+| **LM Head Slicing** | Slices to the last tile before the LM head matmul to extract the last token. | Slicing not required. |
### 3.3 Multi-Device
-Please note that this section refers to sharding schemes across devices and not on a multi-core level. For details about different matmul versions and sharding on a core level, please see the [matmul configuration section](#44-op-configs).
+This section refers to sharding schemes across devices and not on a multi-core level. For details about different matmul versions and sharding on a core level, see the [matmul configuration section](#44-op-configs).
There are two main approaches for scaling across multiple devices: `data parallel` and `tensor parallel`.
@@ -1190,14 +1184,12 @@ In tensor parallel scaling there is _one_ instance of the model executed on mult
There are also hybrid forms of those two modes where a cluster of devices runs multiple independent instances of the model, but each of those model instances uses multiple chips in a tensor parallel fashion.
-In the report [Programming Mesh of Devices with TT-NN](../Programming_Mesh_of_Devices/Programming_Mesh_of_Devices_with_TT-NN.md), there is a good introduction to using TTNN's key concepts for scaling to multiple devices. It shows how to use a single handle for a mesh of devices, and how a tensor can be sharded or replicated to that mesh of devices (tensor parallelism).
+In the report [Programming Mesh of Devices with TT-NN](../Programming_Mesh_of_Devices/Programming_Mesh_of_Devices_with_TT-NN.md), there is a good introduction to using TT-NN's key concepts for scaling to multiple devices. It shows how to use a single handle for a mesh of devices, and how a tensor can be sharded or replicated to that mesh of devices (tensor parallelism).
The tensor handle is used analogously to single device tensors, with the only difference being that all operations on that tensor are then executed in parallel on each device and operate on their respective local chunk of data.
-TT-Metal supports different multi-device topologies. The most important ones for us are `Ring` topology, where all devices are connected in a ring shape with each other, and `Line` topology, where a (sub-)group of devices is connected in a line with each other. `Line` topology can be a 1D or 2D grid of devices, where each row and column are connected in a line.
-
-Below is a summary and example code of the most important concepts for mapping a tensor to a mesh of devices in TTNN:
+TT-Metal supports different multi-device topologies. The most important ones for us are `Ring` topology, where all devices are connected in a ring shape with each other, and `Line` topology, where a subgroup of devices is connected in a line with each other. `Line` topology can be a 1D or 2D grid of devices, where each row and column are connected in a line.
-*Figure: Example usage of mesh_device, ShardTensorToMesh and ReplicateTensorToMesh*
+Below is a summary and example code of the most important concepts for mapping a tensor to a mesh of devices in TT-NN:
```python
import ttnn
@@ -1227,18 +1219,18 @@ mesh_tensor_replicated = ttnn.from_torch(
)
```
-The second key concept to scaling a model to multiple devices are Collective Communication Library (CCL) operations. They are used to efficiently exchange data between multiple devices. TTNN currently supports the following CCL Operations:
+The second key concept to scaling a model to multiple devices are Collective Communication Library (CCL) operations. They are used to efficiently exchange data between multiple devices. TT-NN currently supports the following CCL Operations:
- AllGather
- ReduceScatter
- AllReduce
See the [CCL Developer Guide](../EthernetMultichip/CclDeveloperGuide.md) for more comprehensive coverage about CCL and their implementation details. Our library of supported operations can be found [here](../EthernetMultichip/CclDeveloperGuide.md#op-list-op-list).
-#### AllGather
+#### 3.3.1 AllGather
The AllGather operation collects data from all devices, concatenating each chunk along a specified dimension. The result is stored on each device (replication).
- Supported Topologies: Ring, Linear
-- Supported number of links
+- Supported Number of Links
- N300, T3000: 1
- TG: 4 along cluster_axis=0, 3 along cluster_axis=1
- Arguments
@@ -1248,7 +1240,7 @@ The AllGather operation collects data from all devices, concatenating each chunk
- cluster_axis: cluster axis to gather along
- mesh_device: mesh device the tensor is mapped to
-*Figure: Example usage of Ring All-Gather on 2x4 mesh_device*
+The following is an example of Ring All-Gather on a 2x4 mesh_device:
```py
# Execute All-Gather on the sharded tensor
@@ -1256,7 +1248,7 @@ The AllGather operation collects data from all devices, concatenating each chunk
output_tensor = ttnn.all_gather(mesh_tensor_sharded, dim=3, num_links=1)
```
-*Figure: Example usage of Linear All-Gather on 2x4 mesh_device*
+The following is an example of Linear All-Gather on a 2x4 mesh_device:
```py
# Execute All-Gather on the sharded tensor
@@ -1264,19 +1256,19 @@ output_tensor = ttnn.all_gather(mesh_tensor_sharded, dim=3, num_links=1)
output_tensor = ttnn.all_gather(mesh_tensor_sharded, dim=3, num_links=2, cluster_axis=1, mesh_device=mesh_device, topology=ttnn.Topology.Linear)
```
-#### ReduceScatter
-The ReduceScatter operation reduces the data across all devices and shards the result of the reduction over a specified dimension across all devices.
+#### 3.3.2 ReduceScatter
+The ReduceScatter operation reduces data across all devices and shards the result of the reduction over a specified dimension across all devices.
-- Supported Topologies: Ring, Linear
-- Supported number of links: 1
-- Arguments
- - mesh_tensor: a tensor mapped to a mesh_device via mesh_mapper
- - dim: the dimension to concatenate
- - cluster_axis: cluster axis to gather along
- - num_links: number of ethernet links to be used
- - topology: topology configuration ttnn.Ring or ttn.Linear
+- **Supported Topologies:** Ring, Linear
+- **Supported Number of Links:** One
+- **Arguments:**
+  - **mesh_tensor:** A tensor mapped to a mesh_device via mesh_mapper
+  - **dim:** The dimension along which the reduced result is sharded
+  - **cluster_axis:** The cluster axis to perform the reduction along
+  - **num_links:** Number of ethernet links to be used
+  - **topology:** Topology configuration, `ttnn.Topology.Ring` or `ttnn.Topology.Linear`
-*Figure: Example usage of Ring Reduce-Scatter on 2x4 mesh_device*
+The following is example usage of Ring Reduce-Scatter on a 2x4 mesh_device:
```py
# Execute Reduce-Scatter on the sharded tensor
@@ -1284,7 +1276,7 @@ The ReduceScatter operation reduces the data across all devices and shards the r
output_tensor = ttnn.reduce_scatter(mesh_tensor_sharded, dim=3, num_links=1)
```
-*Figure: Example usage of Linear Reduce-Scatter on 2x4 mesh_device*
+The following is example usage of Linear Reduce-Scatter on a 2x4 mesh_device:
```py
# Execute Reduce-Scatter on the sharded tensor
@@ -1292,66 +1284,66 @@ output_tensor = ttnn.reduce_scatter(mesh_tensor_sharded, dim=3, num_links=1)
output_tensor = ttnn.reduce_scatter(mesh_tensor_sharded, dim=3, num_links=1, cluster_axis=1, mesh_device=mesh_device, topology=ttnn.Topology.Linear)
```
-#### AllReduce
+#### 3.3.3 AllReduce
The AllReduce operation reduces data across all devices and stores the entire tensor on each device (replication). It is performed using an AllGather followed by a ReduceScatter.
A fused version of AllReduce is planned, but currently only the composite of AllGather+ReduceScatter is supported.
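A minimal sketch of building that composite from the two CCL calls shown above is given below, written in the ReduceScatter-then-AllGather order used in the sharding schemes later in this section; the input handle `mesh_tensor_partials` is hypothetical:

```py
# Composite AllReduce: reduce the partials across devices, then gather the reduced shards
reduced_shards = ttnn.reduce_scatter(mesh_tensor_partials, dim=3, num_links=1)
output_tensor = ttnn.all_gather(reduced_shards, dim=3, num_links=1)
```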
-#### Sharding schemes for decode
-In decode mode, activations are generally stored in L1 memory, while weights, which are too large, need to be stored in DRAM. The main bottleneck in decode mode is thereby DRAM bandwidth required to load model weights.
+#### 3.3.4 Sharding schemes for decode
+In decode mode, activations are generally stored in L1 memory, while weights, which are too large, must be stored in DRAM. The main bottleneck in decode mode is therefore the DRAM bandwidth required to load model weights.
-The activations in decode mode are so small because they contain the batch size (=users) in the height dimension while sequence length is 1.
+The activations in decode mode are small because they contain the batch size (=users) in the height dimension while sequence length is one.
The only exception is the attention operations computing `softmax(Q*KˆT)*V`. The activation width is the model dim (e.g. 8192 for Llama3-70b).
Activations are not sharded in the height dimension; however, depending on the operation and model, they may be sharded in the width dimension.
-Matmul weights on the other hand can be sharded in width, height or both. Sharding weights across multiple devices significantly reduces DRAM pressure per device, resulting in notable latency improvements. Below is a summary of useful sharding schemes for sharding weights in decode mode. Which scheme to use will depend on the shape and size of the model weights and the target device topology.
+Matmul weights on the other hand can be sharded in width, height, or both. Sharding weights across multiple devices significantly reduces DRAM pressure per device, resulting in notable latency improvements. Below is a summary of useful sharding schemes for sharding weights in decode mode. Which scheme to use will depend on the shape and size of the model weights and the target device topology.
-##### **1D Column parallel**
+##### 3.3.5 1D Column parallel
-Weights are sharded in width, such that each device contains a horizontal slice of the weights. For this scheme the activations need to be gathered beforehead, i.e. each device processes the whole activation. The result of a column parallel matmul is an activation that is sharded in width. An AllGather operation is used on dim=3 to gather (i.e., replicate) activations.
+Weights are sharded in width, such that each device contains a horizontal slice of the weights. For this scheme the activations must be gathered beforehand, i.e. each device processes the whole activation. The result of a column parallel matmul is an activation that is sharded in width. Use an AllGather operation on dim=3 to gather (i.e., replicate) activations.
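A hedged sketch of this scheme is shown below, reusing the `mesh_device` handle and CCL call signatures from the examples above; the weight `w_torch` and the replicated activation `x` are hypothetical:

```py
# 1D column parallel: shard the weight in width, matmul, then gather the width-fractured output
w_sharded = ttnn.from_torch(
    w_torch,
    dtype=ttnn.bfloat8_b,
    layout=ttnn.TILE_LAYOUT,
    device=mesh_device,
    mesh_mapper=ttnn.ShardTensorToMesh(mesh_device, dim=3),  # each device holds a horizontal slice
)
y_fractured = ttnn.linear(x, w_sharded)                        # width-sharded output per device
y_gathered = ttnn.all_gather(y_fractured, dim=3, num_links=1)  # replicate the full output everywhere
```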
-##### **1D Row parallel**
+##### 3.3.6 1D Row parallel
-Weights are sharded in height, such that each device contains a vertical slice of the weights. For this scheme the activations need to be sharded beforehand, i.e. each device processes a width-shard of the activation. The result of a row parallel matmul are activation partials with the final result's output dimensions, each device containing a partial result. To reduce the activations, i.e. compute the final output, a ReduceScatter operation is used to compute the reduced result across all devices and shard the result along a specified dimension.
-Additionally an AllGather operation is used (ReduceScatter+AllGather = AllReduce) to gather the reduced shards and thus replicate the final output on each device.
+Weights are sharded in height, such that each device contains a vertical slice of the weights. For this scheme the activations must be sharded beforehand, i.e. each device processes a width-shard of the activation. The result of a row parallel matmul is a set of activation partials with the final result's output dimensions, each device containing a partial result. To reduce the activations, i.e. compute the final output, use a ReduceScatter operation to compute the reduced result across all devices and shard the result along a specified dimension.
+Additionally use an AllGather operation (ReduceScatter+AllGather = AllReduce) to gather the reduced shards and thus replicate the final output on each device.
-##### **1D Column parallel followed by row parallel (1D weight sharding) **
+##### 3.3.7 1D Column Parallel Followed by Row Parallel (1D Weight Sharding)
-1D Weight Sharding is a sharding scheme that combines column and row parallel matmuls and can reduce the data volume sent over CCL operation and thus speed up computation. It consists of a column parallel matmul followed by a row parallel matmul. In this scheme the initial activations are gathered, and the column parallel matmul produces width-sharded outputs. The row parallel matmul consumes those sharded activations and produces parial outputs. We need an AllReduce (ReduceScatter+AllGather) operation to compute the final reduced and gathered outputs.
+1D Weight Sharding is a sharding scheme that combines column and row parallel matmuls and can reduce the data volume sent over CCL operations and thus speed up computation. It consists of a column parallel matmul followed by a row parallel matmul. In this scheme the initial activations are gathered, and the column parallel matmul produces width-sharded outputs. The row parallel matmul consumes those sharded activations and produces partial outputs. Use an AllReduce (ReduceScatter+AllGather) operation to compute the final reduced and gathered outputs.
-Optimization potential in this scheme depends highly on the input dimensions to the CCL operations. We can use this scheme for the MLP and any sequence of matmuls that expands and then narrows the output dimension again, becuase it moves the CCL operation to a more beneficial location in the computational graph and thus reduces the CCL data volume.
+Optimization potential in this scheme depends highly on the input dimensions to the CCL operations. Use this scheme for the MLP and any sequence of matmuls that expand and then narrow the output dimension again, because it moves the CCL operation to a more beneficial location in the computational graph and thus reduces the CCL data volume.
-Let's look at the MLP as concrete example: in Llama3-70b we have `FF1` and `FF3` with dimensions `[32, 8k] x [8k, 28k]` and then the `FF2` with dimension `[32, 28k] x [28k, 8k]`.
+Let's look at the MLP as a concrete example: in Llama3-70b we have `FF1` and `FF3` with dimensions `[32, 8k] x [8k, 28k]` and then the `FF2` with dimension `[32, 28k] x [28k, 8k]`.
If we gather after `FF1` and `FF3` we have to gather activations of size `[32, 28k/num_devices] -> [32, 28k]` for each of `FF1` and `FF3`; after the `FF2` we'd need to gather again `[32, 8k/num_devices] -> [32, 8k]`.
If instead, we use the 1D weight sharding scheme and thus move the CCL operation after the `FF2`, we only have to ReduceScatter #num_devices partials of size `[32, 8k] -> [32, 8k/num_devices]` and then optionally AllGather to obtain the `[32, 8k]` gathered outputs.
-##### **2D Weight Sharding**
+##### 3.3.8 2D Weight Sharding
In 2D Weight Sharding on a 2D cluster, weights are sharded both in width and height, such that each device contains a block of the weights.
For this scheme the activations are width-sharded along `cluster_axis=0` and are replicated along `cluster_axis=1`, and the weights are block-sharded. Thus, each device processes a width-shard of the activation, and a block of the weights where the activations are replicated over one axis but the weights are not.
The matmul result will be width-sharded along `cluster_axis=0` and contain partial results along `cluster_axis=1`.
-Typically an AllReduce (ReduceScatter+AllGather) is used to first reduce along `cluster_axis=1` and then gather the shards along `cluster_axis=0`.
+Typically we use an AllReduce (ReduceScatter+AllGather) to first reduce along `cluster_axis=1` and then gather the shards along `cluster_axis=0`.
-##### **Optimal strategy**
+##### 3.3.9 Optimal strategy
-The optimal usage strategy of different parallelisation schemes depends on the specific shapes and model architecture, as well as the target device topology. To select the best parallelisation strategy, the overall data movement for each scheme can be computed; selecting the parallelisation stratgy with the lowest overall data movement will generally result in the best performance.
+The optimal usage strategy of different parallelization schemes depends on the specific shapes and model architecture, as well as the target device topology. To select the best parallelization strategy, the overall data movement for each scheme can be computed; selecting the parallelization strategy with the lowest overall data movement will generally result in the best performance.
-To compute the data movement for a given parallelisation strategy, first the required sequence of parallelisation strategies and corresponding CCL operations is sketched out, and then the resulting dat movement is computed. The following table shows constraints on input and output activations for each parallelisation strategy. A partial activation always has to be reduced (ReduceScatter or AllReduce), while fractured activations may or may not need to be gathered, dependent on the consumer operation. A binary op for example is executed on the fractured activaiton to parallelise computation, while a matmul 1D column parallel operation requires inputs to be gathered in k.
+To compute the data movement for a given parallelization strategy, first sketch out the required sequence of parallelization steps and corresponding CCL operations, and then compute the resulting data movement. The following table shows constraints on input and output activations for each parallelization strategy. A partial activation always has to be reduced (ReduceScatter or AllReduce), while fractured activations may or may not need to be gathered, depending on the consumer operation. A binary OP, for example, is executed on the fractured activation to parallelize computation, while a matmul 1D column parallel operation requires inputs to be gathered in K.
-| Parallelisation strategy | Input activation requirement | Output activation requirement |
+| Parallelization strategy | Input activation requirement | Output activation requirement |
|---------------------------|-----------------|-----------------|
-| 1D Column parallel | Gathered in k | Fractured in k |
-| 1D row parallel | Fractured in k | Partials of full size |
-| 1D column + row parallel | Gathered in k | Partials of full size |
-| 2D parallel | Fractured in k | Partials over one cluster axis |
+| 1D Column Parallel | Gathered in K | Fractured in K |
+| 1D Row Parallel | Fractured in K | Partials of full size |
+| 1D Column + Row Parallel | Gathered in K | Partials of full size |
+| 2D Parallel | Fractured in K | Partials over one cluster axis |
The overall data movement (DM) is then computed using:
@@ -1360,21 +1352,19 @@ The overall data movement (DM) is then computed using:
| AllGather | DM = (K⋅N⋅DF/D)⋅(D−1)⋅D | DM = (K⋅N⋅DF)⋅D⋅log2(D) |
| ReduceScatter | DM = (K⋅N⋅DF)⋅(1-(1/D)) | DM = (K⋅N⋅DF) ⋅ (D-1) / D |
-where K and N are height and width of the weight tensor, DF is the data format multiplyer (number of bytes per datum) and D is the number of devices along the axis that the CCL operation is performed on. Ring topology is more optimised and results in less overall data movement.
-
+Where K and N are the height and width of the weight tensor, DF is the data format multiplier (number of bytes per datum), and D is the number of devices along the axis that the CCL operation is performed on. Ring topology is more optimized and results in less overall data movement.
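To get a feel for the magnitudes involved, the sketch below evaluates the two AllGather expressions and the first ReduceScatter expression from the table for a hypothetical 8192 x 1024 weight in a 2-byte format across 4 devices:

```python
import math

K, N, DF, D = 8192, 1024, 2, 4  # hypothetical weight dims, bytes per datum, devices

dm_allgather_a = (K * N * DF / D) * (D - 1) * D   # ~50.3 MB
dm_allgather_b = (K * N * DF) * D * math.log2(D)  # ~134.2 MB
dm_reducescatter = (K * N * DF) * (1 - 1 / D)     # ~12.6 MB
```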
-
-##### **Examplary parallelisation scheme: Llama3**
+##### 3.3.10 Example parallelization scheme: Llama3
For our [Llama3 family of models](../../models/demos/llama3) we are using the following sharding schemes in our multi-device architectures:
| Matmul | N300 | T3000 | TG |
|-------------------|-----------------|-----------------|-----------------|
-| [_QKV projection_](../../models/demos/llama3/tt/llama_attention.py) | Column parallel | Column parallel | 2D |
-| [_Dense out_](../../models/demos/llama3/tt/llama_attention.py) | Row parallel | Row parallel | 2D |
-| [_FF1_](../../models/demos/llama3/tt/llama_mlp.py) | Column parallel | Column parallel | 2D |
-| [_FF3_](../../models/demos/llama3/tt/llama_mlp.py) | Column parallel | Column parallel | 2D |
-| [_FF2_](../../models/demos/llama3/tt/llama_mlp.py) | Row parallel | Row parallel | 2D |
+| [_QKV projection_](../../models/demos/llama3/tt/llama_attention.py) | Column Parallel | Column Parallel | 2D |
+| [_Dense out_](../../models/demos/llama3/tt/llama_attention.py) | Row Parallel | Row Parallel | 2D |
+| [_FF1_](../../models/demos/llama3/tt/llama_mlp.py) | Column Parallel | Column Parallel | 2D |
+| [_FF3_](../../models/demos/llama3/tt/llama_mlp.py) | Column Parallel | Column Parallel | 2D |
+| [_FF2_](../../models/demos/llama3/tt/llama_mlp.py) | Row Parallel | Row Parallel | 2D |
### 3.4 Continuous Batching
@@ -1382,7 +1372,7 @@ Continuous batching is a serving optimization. To describe continuous batching,
Without continuous batching, an LLM service waits for `batch_size` requests to come in. The service then prefills each request. Then, the service decodes the batched requests token by token. Once all users in the batch finish generation, the service accepts new requests. This is suboptimal because 1) some requests might end generation early, so 2) some slots in the batch are not doing useful computation, while 3) new requests are waiting.
-In contrast, continuous batching allows the service to process new requests as soon as there is a free slot in the batch. The pseudo-code for this algorithm is shown below.
+In contrast, continuous batching allows the service to process new requests as soon as there is a free slot in the batch. The pseudo-code for this algorithm is shown below:
```python
while True:
@@ -1399,27 +1389,27 @@ The above image from anyscale (https://www.anyscale.com/blog/continuous-batching
Continuous batching improves TTFT by reducing wait times for incoming users. It also increases total throughput by keeping the decode batch full of useful work.
-Continuous batching is an LLM serving optimization but it requires some support in the model. The model has to support single user prefill so that when a slot is open, the model can prefill a new request into a specific slot of the batch. The model also has to support batched decode where position ids can be different for each user in the batch, to avoid context contamination.
+Continuous batching is an LLM serving optimization but it requires some support in the model. The model has to support single user prefill so that when a slot is open, the model can prefill a new request into a specific slot of the batch. The model also has to support batched decode where position IDs can be different for each user in the batch, to avoid context contamination.
Implementing continuous batching requires that the serving code track data for each slot of the batch. An example of our continuous batching demo can be found [here](../../models/demos/t3000/llama2_70b/demo/demo_continuous_batching.py). In production deployment, vLLM handles continuous batching for the LLM service.
### 3.5 vLLM Integration
-#### Overview
+#### 3.5.1 Overview
vLLM is an [open-source LLM serving library](https://github.com/vllm-project/vllm). We use vLLM to serve our models in production because of the features it enables. On the serving side, vLLM supports continuous batching and [paged attention](https://arxiv.org/pdf/2309.06180). In addition, vLLM provides an OpenAI-compatible server which is useful for deployment.
Tenstorrent maintains a [fork of vLLM](https://github.com/tenstorrent/vllm/tree/dev) for serving models on Tenstorrent hardware. The [README](https://github.com/tenstorrent/vllm/tree/dev/tt_metal/README.md) has instructions for setting up the environment.
-#### Implementation Requirements
+#### 3.5.2 Implementation Requirements
In order to add vLLM support to a new model, the model must conform to a certain interface. An example of the interface is the [Llama2-70b generation code](../../models/demos/t3000/llama2_70b/tt/llama_generation.py), which implements `prefill_forward`, `decode_forward`, and `initialize_vllm_model`.
Beyond implementing the functionality needed for continuous batching, a model must also implement paged attention. For an example, see [Llama2-70b attention](../../models/demos/t3000/llama2_70b/tt/llama_attention_optimized.py).
-#### vLLM modifications
+#### 3.5.3 vLLM modifications
On the vLLM side there may be additional changes needed to support the new model.
- Modify [`tt_loader.py`](https://github.com/tenstorrent/vllm/blob/dev/vllm/model_executor/model_loader/tt_loader.py) if the model requires a different initialization.
- Modify [`tt_model_runner.py`](https://github.com/tenstorrent/vllm/blob/dev/vllm/worker/tt_model_runner.py) if it is missing functionality for the new model.
-#### Testing
+#### 3.5.4 Testing
Finally, test the new model through vLLM. Register the new model as seen in [`offline_inference_tt.py`](https://github.com/tenstorrent/vllm/blob/dev/examples/offline_inference_tt.py).
```python
@@ -1476,7 +1466,7 @@ output = ttnn.linear(
When you don't pass memory configs or program configs the operation will choose default values. These defaults are often sub-optimal. `memory_config` typically defaults to a DRAM interleaved configuration, while `program_config` defaults to something reasonable but still sub-optimal.
See [Matrix Engine](../matrix_engine/matrix_engine.md) for background on `compute_kernel_config`.
-#### Memory Configs
+#### 4.4.1 Memory Configs
For the LLM context, memory configs are not as important in prefill mode, where activations are large (due to the long sequence lengths) and thus should generally be DRAM interleaved (otherwise wouldn't fit on L1). In prefill mode, each op should consume DRAM interleaved inputs and produce DRAM interleaved output(s).
Memory configs are most important in decode mode. For some operation like `ttnn.matmul`, both the activation and the output will be sharded according to their memory configs. Decode mode activations are of shape `[batch_size, hidden_size]` and should be width-sharded in L1 (sharding the `hidden_size` dimension). By keeping activations and outputs width-sharded in L1 we reduce DRAM traffic and get better performance. The Llama3 codebase has examples of how to create a width-sharded memory config (see [Llama3 model config](../../models/demos/llama3/tt/model_config.py)).
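One way to build such a config is with `ttnn.create_sharded_memory_config`; the core grid and tensor shape below are illustrative assumptions, not values from the Llama3 config:

```python
# Width-shard a [32, 8192] decode activation across an 8x8 core grid in L1
sharded_memory_config = ttnn.create_sharded_memory_config(
    shape=(32, 8192),  # logical tensor shape; each core gets a [32, 128] width shard
    core_grid=ttnn.CoreGrid(y=8, x=8),
    strategy=ttnn.ShardStrategy.WIDTH,
    orientation=ttnn.ShardOrientation.ROW_MAJOR,
)
```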
@@ -1504,12 +1494,12 @@ As always, you should try running your `ttnn` op in a unit test with whichever s
Be careful when your memory config creates shards that require padding (i.e., the shard shape does not divide evenly into 32x32 tiles). Padded shards and padded ops are under active development and can be sources of bugs. When your memory config requires padding, you probably want to instead find a core grid which divides evenly into the tensor shape.
-#### Program Configs and Picking the Right Matmul
+#### 4.4.2 Program Configs and Picking the Right Matmul
Each `ttnn` operation has its own program config class. In general, program configs set hyperparameters that affect an op's functionality and performance. There are too many ops and program configs to cover in detail, so we will focus on `ttnn.matmul`, since it has multiple variants and usually requires the most care.
Picking a matmul variant is a key decision in optimizing a model. The choice depends on the shapes of the inputs and outputs and how the matmul fits into the rest of the model. You choose a variant by providing a specific `program_config` to `ttnn.matmul`. The following presents three matmul variants that are commonly used in LLMs.
-##### Matmul 2D
+##### 4.4.3 Matmul 2D
Matmul 2D gets its name because it parallelizes an `(M x K) @ (K x N)` matmul over the M and N dimensions. It is useful to have this 2D parallelization when M and N are large (usually >= 256). Rule of thumb: use matmul 2D for all matmuls in prefill mode. Generally, inputs and output to matmul 2D will be interleaved in DRAM because these matmuls should be compute bound rather than memory bound and the inputs may be too large to fit in L1. NOTE: the weights can be DRAM sharded and still work with matmul 2D.
The following is a description of the program config for matmul 2D.
@@ -1557,7 +1547,7 @@ fuse_batch=False,
Since we use matmul 2D for large matmuls, the kernel may run out of L1 space just to store intermediate values. When this happens, try reducing `in0_block_w`, `out_subblock_h`, and `out_subblock_w`.
-##### DRAM-Sharded Matmul
+##### 4.4.4 DRAM-Sharded Matmul
DRAM-Sharded matmul should be used in decode mode, where activations are small and DRAM-bandwidth to read weights is the limiting factor in op performance. This matmul gets its name because rather than having weights interleaved in DRAM, they are sharded across DRAM banks to optimally collocate weights with compute. See the [DRAM-Sharded Matmul](../Saturating_DRAM_bandwidth/Saturating_DRAM_bandwidth.md) writeup for details on the implementation.
We use DRAM-Sharded matmul for all matmuls in decode mode. The activation and output are width-sharded in L1, and the weights are width-sharded in DRAM.
@@ -1595,7 +1585,7 @@ output = ttnn.linear(
Be careful that the core grid evenly divides both the activations and the output. Padding functionality is not yet implemented for DRAM-Sharded matmuls.
-#### Matmul 1D
+#### 4.4.5 Matmul 1D
Matmul 1D is the final variant to cover. Before ttnn implemented DRAM-Sharded matmul, this was the matmul of choice for decode mode. Now that DRAM-Sharded matmul exists and is much faster, matmul 1D is less often used.
Matmul 1D gets its name because it only parallelizes over the N dimension. The activation and output(s) should be width-sharded in L1. Weights should be DRAM interleaved.
@@ -1623,7 +1613,7 @@ When creating a matmul 1D program config, maximize the `in0_block_w` and `out_su
While we work on maximizing the performance of large language models on Tenstorrent hardware, we must also ensure that the models are functionally correct and produce outputs of the expected quality. The subsections below describe our methods for evaluating the accuracy (also referred to as functionality or correctness for our purposes) of a given model and how to debug accuracy issues.
-#### Accuracy Testing
+#### 4.5.1 Accuracy Testing
Below is a list of metrics that are used when evaluating accuracy:
- **Pearson Correlation Coefficient (PCC)**: A measure of the linear relationship between two variables, where a PCC of 1 indicates a perfect positive correlation and a PCC of 0 indicates no linear correlation (a minimal computation sketch is shown after this list).
@@ -1637,7 +1627,7 @@ In order to thoroughly test the accuracy of a model, a bottom up approach is tak
- **Model-level unit tests**: In addition to the sub-module unit tests, there should also be unit tests for a full layer of the model with all sub-modules, and for the full model comprising all layers. For example, the [llama3 model test](https://github.com/tenstorrent/tt-metal/blob/main/models/demos/llama3/tests/test_llama_model.py) runs one or more layers of the model over multiple iterations and checks the PCC against the reference model. A rule of thumb is that the full model PCC should be approximately 0.99.
- **Dataset evaluation**: Once a model has been brought up with sufficient accuracy on the smaller unit tests, it should be tested on a larger set of prompts such as a full dataset or a subset of it. For example, the [Falcon7b perplexity test](https://github.com/tenstorrent/tt-metal/blob/main/models/demos/falcon7b_common/tests/perplexity/test_perplexity_falcon.py) loads a subset of the [WikiText dataset](https://huggingface.co/datasets/Salesforce/wikitext) and computes several metrics (including perplexity and top-1/5 accuracy) for evaluating the TT model with respect to the ground truth from the dataset. The results of these metrics should be comparable (e.g. within a couple percentage points of difference) to those obtained from running the evaluation with the reference model on CPU / GPU.
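Below is a minimal sketch of the PCC metric referenced in the list above, using `torch.corrcoef`. This is an illustrative helper, not the exact utility used in the test files.
```python
import torch


def pearson_correlation(expected: torch.Tensor, actual: torch.Tensor) -> float:
    # Flatten both tensors and compute the Pearson correlation coefficient
    # between the reference (CPU/GPU) output and the TT output.
    x = expected.flatten().float()
    y = actual.flatten().float()
    return torch.corrcoef(torch.stack([x, y]))[0, 1].item()


# Usage: pcc = pearson_correlation(reference_logits, tt_logits)  # expect ~0.99 for a full model
```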
-#### Debugging Accuracy
+#### 4.5.2 Debugging Accuracy
If, during model bringup or optimization, the model outputs do not seem reasonable or any of the evaluations above fail, take the following steps to debug accuracy:
1. Locate the smallest module test that is failing. The fewer operations that could be causing the issue, the easier it is to find the root cause. In most cases, the issue can be found with a 1-layer or submodule test.
@@ -1658,7 +1648,7 @@ In some cases, it may be possible that the issue is not with the model and that
### 4.6 Performance Analysis
-ttnn performance has five components:
+TT-NN performance has five components:
![Performance components overview](images/4.6-overview.png)
@@ -1670,9 +1660,10 @@ ttnn performance has five components:
Each of these components is described in more detail below. It is important to confirm that tracing has been enabled; see [4.1 Tracing](#41-tracing) for details. Tracing should be used for decode mode but not for prefill mode.
-**This means that for decode mode you won’t have to worry about 1-3 but for prefill mode you will.**
+> [!NOTE]
+> This means that for decode mode you won’t have to worry about 1-3 but for prefill mode you will.
-#### 1. Main Python Thread
+#### 4.6.1 Main Python Thread
The main Python thread only matters if you are not tracing; if you are using tracing, it is not important. The Metal Profiler/Tracy can also show Python performance, but for pure Python analysis [viztracer](https://github.com/gaogaotiantian/viztracer) is the recommended tool:
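A minimal sketch using viztracer's Python API is shown below (the traced function is a placeholder for your forward pass, and this is not the exact snippet from the report):
```python
from viztracer import VizTracer


def forward_pass():
    # Placeholder for your model's decode step; replace with the real call.
    sum(range(10_000))


# Trace only the region of interest and write a JSON trace (open it with `vizviewer`).
with VizTracer(output_file="decode_trace.json"):
    for _ in range(32):
        forward_pass()
```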
@@ -1704,11 +1695,11 @@ Top tips:
* Generate shard spec and compute kernel config objects once (e.g. in a constructor) instead of recreating them every time you run the forward pass. Keep the forward pass clean.
* Make sure Metal is compiled in Release mode (default) and you are using ttnn’s async mode (see above).
-#### 2. Host API
+#### 4.6.2 Host API
Any overhead here is outside your control and in our experience is minimal. Use a C++ profiler or [Metal Profiler/Tracy](https://github.com/tenstorrent/tt-metal/blob/main/tech_reports/MetalProfiler/metal-profiler.md) with host stack traces enabled to see this time.
-#### 3. Host-device communications
+#### 4.6.3 Host-device communications
As little communication as possible between the host and the device is preferred. For LLMs this means:
@@ -1734,7 +1725,7 @@ Looking at host-device communications in a python profiler like `viztracer` is p
If you want to measure calls this way, turn async mode off. The time your main Python thread spends in `to_torch` will not include any time spent waiting for the device and will be a closer approximation of the measures above.
-#### 4+5. Device dispatch and op performance
+#### 4.6.4 Device dispatch and OP performance
This is the fun bit, but we need to do a little prep to get started. First, metal must be compiled with `-p` to enable device profiling:
@@ -1762,7 +1753,7 @@ python models/perf/perf_report.py OPS_CSV_FILE
For device performance we recommend looking at a single layer. You can do this by using `--id-range` or by changing your test to run only a single layer of the model. For more information, including how to select specific ranges of OPs, see the [Performance Report Analysis Tool](https://github.com/tenstorrent/tt-metal/tree/main/models/perf).
-##### What makes a good performance test?
+**What makes a good performance test?**
Ideally you should run your model as close to its end-user form as possible, while simplifying it where you can. In practice this means:
@@ -1771,7 +1762,7 @@ Ideally you should run your model in as close to end-user form as possible, simp
* Run a single layer of the model - but be aware of which OPs are run for every layer and which ones are only run at the start and end (e.g. embedding, final norm and LM head).
* Add a tracy signpost, e.g. `tracy.signpost("Performance pass")`, before the part you want to record - `perf_report.py` focuses on this region by default, saving you some work (see the sketch after this list).
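Here is a minimal sketch of the signpost tip above, assuming the `tracy` module is importable as used in the report; the iteration counts and the single-layer function are placeholder assumptions.
```python
import tracy


def run_single_layer():
    # Placeholder for a single decoder-layer forward pass on device.
    pass


# Warmup / compile iterations that we do not want in the report.
for _ in range(2):
    run_single_layer()

tracy.signpost("Performance pass")  # perf_report.py focuses on the region after this signpost

# Measured iterations.
for _ in range(5):
    run_single_layer()
```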
-##### What does such a report look like?
+**What does such a report look like?**
Here is an example without tracing enabled. You can instantly see that more time (756us) is spent in between OPs (op-to-op gap) than running OPs on device (362us)!
@@ -1799,7 +1790,7 @@ There are many individual tips, let’s start with overall advice:
The perfect OP runs on the entire core grid using sharded inputs from L1. Let's look at data movement first, then at specific tips.
-#### Data movement
+#### 4.7.1 Data movement
OPs can read data from:
@@ -1816,7 +1807,7 @@ Activations are placed in L1 and weights placed in DRAM.
See the [op config section](#44-op-configs) for more details on writing shard specs in your code.
-#### Specific tips
+#### 4.7.2 Specific tips
Situation: OPs are reading from the fastest memory they can, sharded if possible. What might still make things slow?
@@ -1863,7 +1854,7 @@ self.compute_kernel_config_hifi2 = ttnn.WormholeComputeKernelConfig(
As always, do not recreate these every single forward pass if you want your python thread to be fast (which you do).
### 4.8 Module Tests
-#### Llama3 Module and Test Differences
+#### 4.8.1 Llama3 Module and Test Differences
In our current Llama3 model, the attention module class (`TtLlamaAttention`) implements two primary methods for attention computation: `forward_prefill` and `forward_decode`.
To test these, we provide two separate attention test files, `test_attention_decode` and `test_attention_prefill`, which create the appropriate input tensors:
diff --git a/tests/nightly/tg/ccl/test_all_gather_async_nightly.py b/tests/nightly/tg/ccl/test_all_gather_async_nightly.py
new file mode 120000
index 00000000000..f342d96f5be
--- /dev/null
+++ b/tests/nightly/tg/ccl/test_all_gather_async_nightly.py
@@ -0,0 +1 @@
+../../../ttnn/unit_tests/operations/ccl/test_all_gather_async_TG_nightly.py
\ No newline at end of file
diff --git a/tests/nightly/tg/ccl/test_reduce_scatter_async_nightly.py b/tests/nightly/tg/ccl/test_reduce_scatter_async_nightly.py
new file mode 120000
index 00000000000..2187a4cc4fb
--- /dev/null
+++ b/tests/nightly/tg/ccl/test_reduce_scatter_async_nightly.py
@@ -0,0 +1 @@
+../../../ttnn/unit_tests/operations/ccl/test_reduce_scatter_async_TG_nightly.py
\ No newline at end of file
diff --git a/tests/scripts/tg/run_tg_nightly_tests.sh b/tests/scripts/tg/run_tg_nightly_tests.sh
index 89e5c253c7c..d3f23a6a50c 100755
--- a/tests/scripts/tg/run_tg_nightly_tests.sh
+++ b/tests/scripts/tg/run_tg_nightly_tests.sh
@@ -7,8 +7,7 @@ run_tg_llama3_70b_tests() {
echo "LOG_METAL: Running run_tg_llama3_70b_tests"
- pytest tests/nightly/tg/ccl/test_all_gather_nightly.py ; fail+=$?
- pytest tests/nightly/tg/ccl/test_reduce_scatter_nightly.py ; fail+=$?
+ pytest -n auto tests/nightly/tg/ccl --timeout=180 ; fail+=$?
# Falcon40B prefill 60 layer end to end with 10 loops; we need 8x8 grid size
pytest tests/nightly/tg/models/demos/tg/llama3_70b ; fail+=$?
diff --git a/tests/sweep_framework/sweeps/eltwise/binary/logical_and/logical_and_sharded.py b/tests/sweep_framework/sweeps/eltwise/binary/logical_and/logical_and_sharded.py
index 0b6bb8b0fff..c482877e082 100644
--- a/tests/sweep_framework/sweeps/eltwise/binary/logical_and/logical_and_sharded.py
+++ b/tests/sweep_framework/sweeps/eltwise/binary/logical_and/logical_and_sharded.py
@@ -10,7 +10,11 @@
import random
import ttnn
from tests.sweep_framework.sweep_utils.utils import gen_shapes, tensor_to_dtype, sanitize_shape_rm
-from tests.sweep_framework.sweep_utils.sharding_utils import gen_sharded_spec_unary, parse_sharding_spec
+from tests.sweep_framework.sweep_utils.sharding_utils import (
+ gen_sharded_spec_unary,
+ parse_sharding_spec,
+ invalidate_vector_sharding,
+)
from tests.tt_eager.python_api_testing.sweep_tests.generation_funcs import gen_func_with_cast_tt
from tests.ttnn.utils_for_testing import assert_equal, check_with_pcc, start_measuring_time, stop_measuring_time
@@ -39,16 +43,13 @@
# If invalidated, the vector will still be stored but will be skipped.
# Returns False, None if the vector is valid, and True, str with a reason for invalidation if it is invalid.
def invalidate_vector(test_vector) -> Tuple[bool, Optional[str]]:
- input_shape, X, Y, sharding_strategy, _, _, input_layout = test_vector["input_spec"].values()
- pre_sharded_height = math.prod(input_shape[:-1])
- pre_sharded_width = input_shape[-1]
+ input_layout = test_vector["input_spec"]["input_layout"]
+ sharding_invalidated, output_str = invalidate_vector_sharding(test_vector["input_spec"])
if input_layout == "ROW_MAJOR_LAYOUT":
- return True, "Input to eltwise binary must be tilized"
-
- if input_layout == "ROW_MAJOR_LAYOUT" and test_vector["input_a_dtype"] == ttnn.bfloat8_b:
- return True, "bfloat8_b is only supported on tiled layout"
-
+ return True, "Inputs to eltwise binary must be tilized"
+ if sharding_invalidated:
+ return sharding_invalidated, output_str
return False, None
@@ -74,6 +75,7 @@ def run(
shard_orientation,
tensor_hw_as_shard_shape,
input_layout,
+ shard_height_mul_of_32,
) = parse_sharding_spec(input_spec)
if input_layout == ttnn.ROW_MAJOR_LAYOUT:
@@ -94,6 +96,7 @@ def run(
strategy=sharding_strategy,
orientation=shard_orientation,
use_height_and_width_as_shard_shape=tensor_hw_as_shard_shape,
+ tile_layout=shard_height_mul_of_32,
)
input_tensor_a = ttnn.from_torch(
diff --git a/tests/sweep_framework/sweeps/eltwise/binary/logical_or/logical_or_sharded.py b/tests/sweep_framework/sweeps/eltwise/binary/logical_or/logical_or_sharded.py
index f32030aae75..826f3a52682 100644
--- a/tests/sweep_framework/sweeps/eltwise/binary/logical_or/logical_or_sharded.py
+++ b/tests/sweep_framework/sweeps/eltwise/binary/logical_or/logical_or_sharded.py
@@ -10,7 +10,11 @@
import random
import ttnn
from tests.sweep_framework.sweep_utils.utils import gen_shapes, tensor_to_dtype, sanitize_shape_rm
-from tests.sweep_framework.sweep_utils.sharding_utils import gen_sharded_spec_unary, parse_sharding_spec
+from tests.sweep_framework.sweep_utils.sharding_utils import (
+ gen_sharded_spec_unary,
+ parse_sharding_spec,
+ invalidate_vector_sharding,
+)
from tests.tt_eager.python_api_testing.sweep_tests.generation_funcs import gen_func_with_cast_tt
from tests.ttnn.utils_for_testing import assert_equal, check_with_pcc, start_measuring_time, stop_measuring_time
@@ -39,16 +43,13 @@
# If invalidated, the vector will still be stored but will be skipped.
# Returns False, None if the vector is valid, and True, str with a reason for invalidation if it is invalid.
def invalidate_vector(test_vector) -> Tuple[bool, Optional[str]]:
- input_shape, X, Y, sharding_strategy, _, _, input_layout = test_vector["input_spec"].values()
- pre_sharded_height = math.prod(input_shape[:-1])
- pre_sharded_width = input_shape[-1]
+ input_layout = test_vector["input_spec"]["input_layout"]
+ sharding_invalidated, output_str = invalidate_vector_sharding(test_vector["input_spec"])
if input_layout == "ROW_MAJOR_LAYOUT":
- return True, "Input to eltwise binary must be tilized"
-
- if input_layout == "ROW_MAJOR_LAYOUT" and test_vector["input_a_dtype"] == ttnn.bfloat8_b:
- return True, "bfloat8_b is only supported on tiled layout"
-
+ return True, "Inputs to eltwise binary must be tilized"
+ if sharding_invalidated:
+ return sharding_invalidated, output_str
return False, None
@@ -74,6 +75,7 @@ def run(
shard_orientation,
tensor_hw_as_shard_shape,
input_layout,
+ shard_height_mul_of_32,
) = parse_sharding_spec(input_spec)
if input_layout == ttnn.ROW_MAJOR_LAYOUT:
@@ -94,6 +96,7 @@ def run(
strategy=sharding_strategy,
orientation=shard_orientation,
use_height_and_width_as_shard_shape=tensor_hw_as_shard_shape,
+ tile_layout=shard_height_mul_of_32,
)
input_tensor_a = ttnn.from_torch(
diff --git a/tests/sweep_framework/sweeps/eltwise/binary/logical_xor/logical_xor_sharded.py b/tests/sweep_framework/sweeps/eltwise/binary/logical_xor/logical_xor_sharded.py
index 59e5cbf0572..cdb9e1c4473 100644
--- a/tests/sweep_framework/sweeps/eltwise/binary/logical_xor/logical_xor_sharded.py
+++ b/tests/sweep_framework/sweeps/eltwise/binary/logical_xor/logical_xor_sharded.py
@@ -10,7 +10,11 @@
import random
import ttnn
from tests.sweep_framework.sweep_utils.utils import gen_shapes, tensor_to_dtype, sanitize_shape_rm
-from tests.sweep_framework.sweep_utils.sharding_utils import gen_sharded_spec_unary, parse_sharding_spec
+from tests.sweep_framework.sweep_utils.sharding_utils import (
+ gen_sharded_spec_unary,
+ parse_sharding_spec,
+ invalidate_vector_sharding,
+)
from tests.tt_eager.python_api_testing.sweep_tests.generation_funcs import gen_func_with_cast_tt
from tests.ttnn.utils_for_testing import assert_equal, check_with_pcc, start_measuring_time, stop_measuring_time
@@ -39,16 +43,13 @@
# If invalidated, the vector will still be stored but will be skipped.
# Returns False, None if the vector is valid, and True, str with a reason for invalidation if it is invalid.
def invalidate_vector(test_vector) -> Tuple[bool, Optional[str]]:
- input_shape, X, Y, sharding_strategy, _, _, input_layout = test_vector["input_spec"].values()
- pre_sharded_height = math.prod(input_shape[:-1])
- pre_sharded_width = input_shape[-1]
+ input_layout = test_vector["input_spec"]["input_layout"]
+ sharding_invalidated, output_str = invalidate_vector_sharding(test_vector["input_spec"])
if input_layout == "ROW_MAJOR_LAYOUT":
- return True, "Input to eltwise binary must be tilized"
-
- if input_layout == "ROW_MAJOR_LAYOUT" and test_vector["input_a_dtype"] == ttnn.bfloat8_b:
- return True, "bfloat8_b is only supported on tiled layout"
-
+ return True, "Inputs to eltwise binary must be tilized"
+ if sharding_invalidated:
+ return sharding_invalidated, output_str
return False, None
@@ -74,6 +75,7 @@ def run(
shard_orientation,
tensor_hw_as_shard_shape,
input_layout,
+ shard_height_mul_of_32,
) = parse_sharding_spec(input_spec)
if input_layout == ttnn.ROW_MAJOR_LAYOUT:
@@ -94,6 +96,7 @@ def run(
strategy=sharding_strategy,
orientation=shard_orientation,
use_height_and_width_as_shard_shape=tensor_hw_as_shard_shape,
+ tile_layout=shard_height_mul_of_32,
)
input_tensor_a = ttnn.from_torch(
diff --git a/tests/sweep_framework/sweeps/eltwise/ternary/addcdiv/addcdiv_sharded.py b/tests/sweep_framework/sweeps/eltwise/ternary/addcdiv/addcdiv_sharded.py
index 9eb1b8d2254..6bc3cde251e 100644
--- a/tests/sweep_framework/sweeps/eltwise/ternary/addcdiv/addcdiv_sharded.py
+++ b/tests/sweep_framework/sweeps/eltwise/ternary/addcdiv/addcdiv_sharded.py
@@ -11,7 +11,11 @@
import ttnn
import math
from tests.sweep_framework.sweep_utils.utils import gen_shapes, sanitize_shape_rm
-from tests.sweep_framework.sweep_utils.sharding_utils import gen_sharded_spec_unary, parse_sharding_spec
+from tests.sweep_framework.sweep_utils.sharding_utils import (
+ gen_sharded_spec_unary,
+ parse_sharding_spec,
+ invalidate_vector_sharding,
+)
from tests.tt_eager.python_api_testing.sweep_tests.generation_funcs import gen_func_with_cast_tt
from tests.ttnn.utils_for_testing import check_with_pcc, start_measuring_time, stop_measuring_time
@@ -39,16 +43,13 @@
# If invalidated, the vector will still be stored but will be skipped.
# Returns False, None if the vector is valid, and True, str with a reason for invalidation if it is invalid.
def invalidate_vector(test_vector) -> Tuple[bool, Optional[str]]:
- input_shape, X, Y, sharding_strategy, _, _, input_layout = test_vector["input_spec"].values()
- pre_sharded_height = math.prod(input_shape[:-1])
- pre_sharded_width = input_shape[-1]
+ input_layout = test_vector["input_spec"]["input_layout"]
+ sharding_invalidated, output_str = invalidate_vector_sharding(test_vector["input_spec"])
if input_layout == "ROW_MAJOR_LAYOUT":
- return True, "Input to eltwise binary must be tilized"
-
- if input_layout == "ROW_MAJOR_LAYOUT" and test_vector["input_dtype"] == ttnn.bfloat8_b:
- return True, "bfloat8_b is only supported on tiled layout"
-
+ return True, "Inputs to eltwise binary must be tilized"
+ if sharding_invalidated:
+ return sharding_invalidated, output_str
return False, None
@@ -72,6 +73,7 @@ def run(
shard_orientation,
tensor_hw_as_shard_shape,
input_layout,
+ shard_height_mul_of_32,
) = parse_sharding_spec(input_spec)
if input_layout == ttnn.ROW_MAJOR_LAYOUT:
@@ -99,6 +101,7 @@ def run(
strategy=sharding_strategy,
orientation=shard_orientation,
use_height_and_width_as_shard_shape=tensor_hw_as_shard_shape,
+ tile_layout=shard_height_mul_of_32,
)
input_tensor_a = ttnn.from_torch(
diff --git a/tests/sweep_framework/sweeps/eltwise/ternary/addcmul/addcmul_sharded.py b/tests/sweep_framework/sweeps/eltwise/ternary/addcmul/addcmul_sharded.py
index 309c4466057..45ab2f8f4e0 100644
--- a/tests/sweep_framework/sweeps/eltwise/ternary/addcmul/addcmul_sharded.py
+++ b/tests/sweep_framework/sweeps/eltwise/ternary/addcmul/addcmul_sharded.py
@@ -11,7 +11,11 @@
import ttnn
import math
from tests.sweep_framework.sweep_utils.utils import gen_shapes, sanitize_shape_rm
-from tests.sweep_framework.sweep_utils.sharding_utils import gen_sharded_spec_unary, parse_sharding_spec
+from tests.sweep_framework.sweep_utils.sharding_utils import (
+ gen_sharded_spec_unary,
+ parse_sharding_spec,
+ invalidate_vector_sharding,
+)
from tests.tt_eager.python_api_testing.sweep_tests.generation_funcs import gen_func_with_cast_tt
from tests.ttnn.utils_for_testing import check_with_pcc, start_measuring_time, stop_measuring_time
@@ -39,16 +43,13 @@
# If invalidated, the vector will still be stored but will be skipped.
# Returns False, None if the vector is valid, and True, str with a reason for invalidation if it is invalid.
def invalidate_vector(test_vector) -> Tuple[bool, Optional[str]]:
- input_shape, X, Y, sharding_strategy, _, _, input_layout = test_vector["input_spec"].values()
- pre_sharded_height = math.prod(input_shape[:-1])
- pre_sharded_width = input_shape[-1]
+ input_layout = test_vector["input_spec"]["input_layout"]
+ sharding_invalidated, output_str = invalidate_vector_sharding(test_vector["input_spec"])
if input_layout == "ROW_MAJOR_LAYOUT":
- return True, "Input to eltwise binary must be tilized"
-
- if input_layout == "ROW_MAJOR_LAYOUT" and test_vector["input_dtype"] == ttnn.bfloat8_b:
- return True, "bfloat8_b is only supported on tiled layout"
-
+ return True, "Inputs to eltwise binary must be tilized"
+ if sharding_invalidated:
+ return sharding_invalidated, output_str
return False, None
@@ -72,6 +73,7 @@ def run(
shard_orientation,
tensor_hw_as_shard_shape,
input_layout,
+ shard_height_mul_of_32,
) = parse_sharding_spec(input_spec)
if input_layout == ttnn.ROW_MAJOR_LAYOUT:
@@ -99,6 +101,7 @@ def run(
strategy=sharding_strategy,
orientation=shard_orientation,
use_height_and_width_as_shard_shape=tensor_hw_as_shard_shape,
+ tile_layout=shard_height_mul_of_32,
)
input_tensor_a = ttnn.from_torch(
diff --git a/tests/sweep_framework/sweeps/eltwise/unary/ceil/ceil_sharded.py b/tests/sweep_framework/sweeps/eltwise/unary/ceil/ceil_sharded.py
index 65f83fb5545..08e6e827a8a 100644
--- a/tests/sweep_framework/sweeps/eltwise/unary/ceil/ceil_sharded.py
+++ b/tests/sweep_framework/sweeps/eltwise/unary/ceil/ceil_sharded.py
@@ -11,7 +11,11 @@
import ttnn
import math
from tests.sweep_framework.sweep_utils.utils import gen_shapes, sanitize_shape_rm
-from tests.sweep_framework.sweep_utils.sharding_utils import gen_sharded_spec_unary, parse_sharding_spec
+from tests.sweep_framework.sweep_utils.sharding_utils import (
+ gen_sharded_spec_unary,
+ parse_sharding_spec,
+ invalidate_vector_sharding,
+)
from tests.tt_eager.python_api_testing.sweep_tests.generation_funcs import gen_func_with_cast_tt
from tests.ttnn.utils_for_testing import check_with_pcc, start_measuring_time, stop_measuring_time
@@ -39,16 +43,13 @@
# If invalidated, the vector will still be stored but will be skipped.
# Returns False, None if the vector is valid, and True, str with a reason for invalidation if it is invalid.
def invalidate_vector(test_vector) -> Tuple[bool, Optional[str]]:
- input_shape, X, Y, sharding_strategy, _, _, input_layout = test_vector["input_spec"].values()
- pre_sharded_height = math.prod(input_shape[:-1])
- pre_sharded_width = input_shape[-1]
+ input_layout = test_vector["input_spec"]["input_layout"]
+ sharding_invalidated, output_str = invalidate_vector_sharding(test_vector["input_spec"])
if input_layout == "ROW_MAJOR_LAYOUT":
- return True, "Input to eltwise binary must be tilized"
-
- if input_layout == "ROW_MAJOR_LAYOUT" and test_vector["input_a_dtype"] == ttnn.bfloat8_b:
- return True, "bfloat8_b is only supported on tiled layout"
-
+ return True, "Inputs to eltwise binary must be tilized"
+ if sharding_invalidated:
+ return sharding_invalidated, output_str
return False, None
@@ -72,6 +73,7 @@ def run(
shard_orientation,
tensor_hw_as_shard_shape,
input_layout,
+ shard_height_mul_of_32,
) = parse_sharding_spec(input_spec)
if input_layout == ttnn.ROW_MAJOR_LAYOUT:
@@ -90,6 +92,7 @@ def run(
strategy=sharding_strategy,
orientation=shard_orientation,
use_height_and_width_as_shard_shape=tensor_hw_as_shard_shape,
+ tile_layout=shard_height_mul_of_32,
)
input_tensor_a = ttnn.from_torch(
diff --git a/tests/sweep_framework/sweeps/eltwise/unary/digamma/digamma.py b/tests/sweep_framework/sweeps/eltwise/unary/digamma/digamma.py
new file mode 100644
index 00000000000..55169b42e1b
--- /dev/null
+++ b/tests/sweep_framework/sweeps/eltwise/unary/digamma/digamma.py
@@ -0,0 +1,69 @@
+# SPDX-FileCopyrightText: © 2024 Tenstorrent Inc.
+
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Optional, Tuple
+from functools import partial
+
+import torch
+import ttnn
+from tests.sweep_framework.sweep_utils.utils import gen_shapes
+from tests.tt_eager.python_api_testing.sweep_tests.generation_funcs import gen_func_with_cast_tt
+
+from tests.ttnn.utils_for_testing import check_with_pcc, start_measuring_time, stop_measuring_time
+from models.utility_functions import torch_random
+
+
+# Parameters provided to the test vector generator are defined here.
+# They are defined as dict-type suites that contain the arguments to the run function as keys, and lists of possible inputs as values.
+# Each suite has a key name (in this case "suite_1") which will associate the test vectors to this specific suite of inputs.
+# Developers can create their own generator functions and pass them to the parameters as inputs.
+parameters = {
+ "nightly": {
+ "input_shape": gen_shapes([1, 1, 1, 1], [6, 12, 256, 256], [1, 1, 1, 1], 32)
+ + gen_shapes([1, 1, 1], [12, 256, 256], [1, 1, 1], 32)
+ + gen_shapes([1, 1], [256, 256], [1, 1], 32),
+ "input_dtype": [ttnn.bfloat16, ttnn.bfloat8_b],
+ "input_layout": [ttnn.TILE_LAYOUT],
+ "input_memory_config": [ttnn.DRAM_MEMORY_CONFIG, ttnn.L1_MEMORY_CONFIG],
+ "output_memory_config": [ttnn.DRAM_MEMORY_CONFIG, ttnn.L1_MEMORY_CONFIG],
+ },
+}
+
+
+# This is the run instructions for the test, defined by the developer.
+# The run function must take the above-defined parameters as inputs.
+# The runner will call this run function with each test vector, and the returned results from this function will be stored.
+# If you defined a device_mesh_fixture above, the object you yielded will be passed into this function as 'device'. Otherwise, it will be the default ttnn device opened by the infra.
+def run(
+ input_shape,
+ input_dtype,
+ input_layout,
+ input_memory_config,
+ output_memory_config,
+ *,
+ device,
+) -> list:
+ torch.manual_seed(0)
+
+ torch_input_tensor = gen_func_with_cast_tt(
+ partial(torch_random, low=0.0001, high=100, dtype=torch.float32), input_dtype
+ )(input_shape)
+ golden_function = ttnn.get_golden_function(ttnn.digamma)
+ torch_output_tensor = golden_function(torch_input_tensor)
+
+ input_tensor = ttnn.from_torch(
+ torch_input_tensor,
+ dtype=input_dtype,
+ layout=input_layout,
+ device=device,
+ memory_config=input_memory_config,
+ )
+
+ start_time = start_measuring_time()
+ result = ttnn.digamma(input_tensor, memory_config=output_memory_config)
+ output_tensor = ttnn.to_torch(result)
+ e2e_perf = stop_measuring_time(start_time)
+
+ pcc = check_with_pcc(torch_output_tensor, output_tensor, 0.999)
+ return [pcc, e2e_perf]
diff --git a/tests/sweep_framework/sweeps/eltwise/unary/digamma/digamma_sharded.py b/tests/sweep_framework/sweeps/eltwise/unary/digamma/digamma_sharded.py
new file mode 100644
index 00000000000..35273e221d0
--- /dev/null
+++ b/tests/sweep_framework/sweeps/eltwise/unary/digamma/digamma_sharded.py
@@ -0,0 +1,114 @@
+# SPDX-FileCopyrightText: © 2024 Tenstorrent Inc.
+
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Optional, Tuple
+from functools import partial
+
+import json
+import torch
+import random
+import ttnn
+import math
+from tests.sweep_framework.sweep_utils.utils import gen_shapes, sanitize_shape_rm
+from tests.sweep_framework.sweep_utils.sharding_utils import (
+ gen_sharded_spec_unary,
+ parse_sharding_spec,
+ invalidate_vector_sharding,
+)
+from tests.tt_eager.python_api_testing.sweep_tests.generation_funcs import gen_func_with_cast_tt
+
+from tests.ttnn.utils_for_testing import check_with_pcc, start_measuring_time, stop_measuring_time
+from models.utility_functions import torch_random
+
+# Override the default timeout in seconds for hang detection.
+TIMEOUT = 120
+
+random.seed(0)
+
+
+# Parameters provided to the test vector generator are defined here.
+# They are defined as dict-type suites that contain the arguments to the run function as keys, and lists of possible inputs as values.
+# Each suite has a key name (in this case "suite_1" and "suite_2") which will associate the test vectors to this specific suite of inputs.
+# Developers can create their own generator functions and pass them to the parameters as inputs.
+parameters = {
+ "nightly": {
+ "input_spec": gen_sharded_spec_unary(16, max_tensor_size_per_core=20 * 1024, layouts=["TILE_LAYOUT"]),
+ "input_a_dtype": [ttnn.bfloat16, ttnn.bfloat8_b],
+ },
+}
+
+
+# Invalidate vector is called during the generation phase where each vector will be passed in.
+# If invalidated, the vector will still be stored but will be skipped.
+# Returns False, None if the vector is valid, and True, str with a reason for invalidation if it is invalid.
+def invalidate_vector(test_vector) -> Tuple[bool, Optional[str]]:
+ input_layout = test_vector["input_spec"]["input_layout"]
+ sharding_invalidated, output_str = invalidate_vector_sharding(test_vector["input_spec"])
+
+ if input_layout == "ROW_MAJOR_LAYOUT":
+ return True, "Input to eltwise binary must be tilized"
+ if input_layout == "ROW_MAJOR_LAYOUT" and test_vector["input_a_dtype"] == ttnn.bfloat8_b:
+ return True, "bfloat8_b is only supported on tiled layout"
+ if sharding_invalidated:
+ return sharding_invalidated, output_str
+
+ return False, None
+
+
+# This is the run instructions for the test, defined by the developer.
+# The run function must take the above-defined parameters as inputs.
+# The runner will call this run function with each test vector, and the returned results from this function will be stored.
+# If you defined a mesh_device_fixture above, the object you yielded will be passed into this function as 'device'. Otherwise, it will be the default ttnn device opened by the infra.
+def run(
+ input_spec,
+ input_a_dtype,
+ *,
+ device,
+) -> list:
+ data_seed = random.randint(0, 20000000)
+ torch.manual_seed(data_seed)
+
+ (
+ input_shape,
+ core_grid,
+ sharding_strategy,
+ shard_orientation,
+ tensor_hw_as_shard_shape,
+ input_layout,
+ shard_height_mul_of_32,
+ ) = parse_sharding_spec(input_spec)
+
+ if input_layout == ttnn.ROW_MAJOR_LAYOUT:
+ input_shape = sanitize_shape_rm(input_shape)
+
+ sharded_config = ttnn.create_sharded_memory_config_(
+ shape=input_shape,
+ core_grid=core_grid,
+ strategy=sharding_strategy,
+ orientation=shard_orientation,
+ use_height_and_width_as_shard_shape=tensor_hw_as_shard_shape,
+ tile_layout=shard_height_mul_of_32,
+ )
+
+ torch_input_tensor_a = gen_func_with_cast_tt(
+ partial(torch_random, low=0.0001, high=100, dtype=torch.float32), input_a_dtype
+ )(input_shape)
+ golden_function = ttnn.get_golden_function(ttnn.digamma)
+ torch_output_tensor = golden_function(torch_input_tensor_a)
+
+ input_tensor_a = ttnn.from_torch(
+ torch_input_tensor_a,
+ dtype=input_a_dtype,
+ layout=input_layout,
+ device=device,
+ memory_config=sharded_config,
+ )
+
+ start_time = start_measuring_time()
+ output_tensor = ttnn.digamma(input_tensor_a, memory_config=sharded_config)
+ e2e_perf = stop_measuring_time(start_time)
+ output_tensor = ttnn.to_torch(output_tensor)
+
+ pcc = check_with_pcc(torch_output_tensor, output_tensor, 0.999)
+ return [pcc, e2e_perf]
diff --git a/tests/sweep_framework/sweeps/eltwise/unary/elu/elu_sharded.py b/tests/sweep_framework/sweeps/eltwise/unary/elu/elu_sharded.py
new file mode 100644
index 00000000000..cc03eb013bf
--- /dev/null
+++ b/tests/sweep_framework/sweeps/eltwise/unary/elu/elu_sharded.py
@@ -0,0 +1,115 @@
+# SPDX-FileCopyrightText: © 2024 Tenstorrent Inc.
+
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Optional, Tuple
+from functools import partial
+
+import json
+import torch
+import random
+import ttnn
+import math
+from tests.sweep_framework.sweep_utils.utils import gen_shapes, sanitize_shape_rm
+from tests.sweep_framework.sweep_utils.sharding_utils import (
+ gen_sharded_spec_unary,
+ parse_sharding_spec,
+ invalidate_vector_sharding,
+)
+from tests.tt_eager.python_api_testing.sweep_tests.generation_funcs import gen_func_with_cast_tt
+
+from tests.ttnn.utils_for_testing import check_with_pcc, start_measuring_time, stop_measuring_time
+from models.utility_functions import torch_random
+
+# Override the default timeout in seconds for hang detection.
+TIMEOUT = 120
+
+random.seed(0)
+
+
+# Parameters provided to the test vector generator are defined here.
+# They are defined as dict-type suites that contain the arguments to the run function as keys, and lists of possible inputs as values.
+# Each suite has a key name (in this case "suite_1" and "suite_2") which will associate the test vectors to this specific suite of inputs.
+# Developers can create their own generator functions and pass them to the parameters as inputs.
+parameters = {
+ "nightly": {
+ "input_spec": gen_sharded_spec_unary(6, layouts=["TILE_LAYOUT"]),
+ "input_a_dtype": [ttnn.bfloat16, ttnn.bfloat8_b],
+ "alpha": [-0.5, 0, 0.5],
+ },
+}
+
+
+# Invalidate vector is called during the generation phase where each vector will be passed in.
+# If invalidated, the vector will still be stored but will be skipped.
+# Returns False, None if the vector is valid, and True, str with a reason for invalidation if it is invalid.
+def invalidate_vector(test_vector) -> Tuple[bool, Optional[str]]:
+ input_layout = test_vector["input_spec"]["input_layout"]
+ sharding_invalidated, output_str = invalidate_vector_sharding(test_vector["input_spec"])
+
+ if input_layout == "ROW_MAJOR_LAYOUT":
+ return True, "Input to eltwise binary must be tilized"
+ if input_layout == "ROW_MAJOR_LAYOUT" and test_vector["input_a_dtype"] == ttnn.bfloat8_b:
+ return True, "bfloat8_b is only supported on tiled layout"
+ if sharding_invalidated:
+ return sharding_invalidated, output_str
+
+ return False, None
+
+
+# This is the run instructions for the test, defined by the developer.
+# The run function must take the above-defined parameters as inputs.
+# The runner will call this run function with each test vector, and the returned results from this function will be stored.
+# If you defined a mesh_device_fixture above, the object you yielded will be passed into this function as 'device'. Otherwise, it will be the default ttnn device opened by the infra.
+def run(
+ input_spec,
+ input_a_dtype,
+ alpha,
+ *,
+ device,
+) -> list:
+ data_seed = random.randint(0, 20000000)
+ torch.manual_seed(data_seed)
+
+ (
+ input_shape,
+ core_grid,
+ sharding_strategy,
+ shard_orientation,
+ tensor_hw_as_shard_shape,
+ input_layout,
+ shard_height_mul_of_32,
+ ) = parse_sharding_spec(input_spec)
+
+ if input_layout == ttnn.ROW_MAJOR_LAYOUT:
+ input_shape = sanitize_shape_rm(input_shape)
+
+ sharded_config = ttnn.create_sharded_memory_config_(
+ shape=input_shape,
+ core_grid=core_grid,
+ strategy=sharding_strategy,
+ orientation=shard_orientation,
+ use_height_and_width_as_shard_shape=tensor_hw_as_shard_shape,
+ tile_layout=shard_height_mul_of_32,
+ )
+
+ torch_input_tensor_a = gen_func_with_cast_tt(
+ partial(torch_random, low=-100, high=100, dtype=torch.float32), input_a_dtype
+ )(input_shape)
+ torch_output_tensor = torch.nn.functional.elu(torch_input_tensor_a, alpha=alpha)
+
+ input_tensor_a = ttnn.from_torch(
+ torch_input_tensor_a,
+ dtype=input_a_dtype,
+ layout=input_layout,
+ device=device,
+ memory_config=sharded_config,
+ )
+
+ start_time = start_measuring_time()
+ output_tensor = ttnn.elu(input_tensor_a, alpha, memory_config=sharded_config)
+ e2e_perf = stop_measuring_time(start_time)
+ output_tensor = ttnn.to_torch(output_tensor)
+
+ pcc = check_with_pcc(torch_output_tensor, output_tensor, 0.999)
+ return [pcc, e2e_perf]
diff --git a/tests/sweep_framework/sweeps/eltwise/unary/eqz/eqz.py b/tests/sweep_framework/sweeps/eltwise/unary/eqz/eqz.py
new file mode 100644
index 00000000000..e1bd10f6f75
--- /dev/null
+++ b/tests/sweep_framework/sweeps/eltwise/unary/eqz/eqz.py
@@ -0,0 +1,68 @@
+# SPDX-FileCopyrightText: © 2024 Tenstorrent Inc.
+
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Optional, Tuple
+from functools import partial
+
+import torch
+import ttnn
+from tests.sweep_framework.sweep_utils.utils import gen_shapes
+from tests.tt_eager.python_api_testing.sweep_tests.generation_funcs import gen_func_with_cast_tt
+from tests.ttnn.utils_for_testing import check_with_pcc, start_measuring_time, stop_measuring_time
+from models.utility_functions import torch_random_with_zeros
+
+
+# Parameters provided to the test vector generator are defined here.
+# They are defined as dict-type suites that contain the arguments to the run function as keys, and lists of possible inputs as values.
+# Each suite has a key name (in this case "suite_1") which will associate the test vectors to this specific suite of inputs.
+# Developers can create their own generator functions and pass them to the parameters as inputs.
+parameters = {
+ "nightly": {
+ "input_shape": gen_shapes([1, 1, 1, 1], [6, 12, 256, 256], [1, 1, 1, 1], 32)
+ + gen_shapes([1, 1, 1], [12, 256, 256], [1, 1, 1], 32)
+ + gen_shapes([1, 1], [256, 256], [1, 1], 32),
+ "input_dtype": [ttnn.bfloat16],
+ "input_layout": [ttnn.TILE_LAYOUT],
+ "input_memory_config": [ttnn.DRAM_MEMORY_CONFIG, ttnn.L1_MEMORY_CONFIG],
+ "output_memory_config": [ttnn.DRAM_MEMORY_CONFIG, ttnn.L1_MEMORY_CONFIG],
+ },
+}
+
+
+# This is the run instructions for the test, defined by the developer.
+# The run function must take the above-defined parameters as inputs.
+# The runner will call this run function with each test vector, and the returned results from this function will be stored.
+# If you defined a device_mesh_fixture above, the object you yielded will be passed into this function as 'device'. Otherwise, it will be the default ttnn device opened by the infra.
+def run(
+ input_shape,
+ input_dtype,
+ input_layout,
+ input_memory_config,
+ output_memory_config,
+ *,
+ device,
+) -> list:
+ torch.manual_seed(0)
+
+ torch_input_tensor = gen_func_with_cast_tt(
+ partial(torch_random_with_zeros, low=-10, high=10, dtype=torch.float32), input_dtype
+ )(input_shape)
+ golden_function = ttnn.get_golden_function(ttnn.eqz)
+ torch_output_tensor = golden_function(torch_input_tensor)
+
+ input_tensor = ttnn.from_torch(
+ torch_input_tensor,
+ dtype=input_dtype,
+ layout=input_layout,
+ device=device,
+ memory_config=input_memory_config,
+ )
+
+ start_time = start_measuring_time()
+ result = ttnn.eqz(input_tensor, memory_config=output_memory_config)
+ output_tensor = ttnn.to_torch(result)
+ e2e_perf = stop_measuring_time(start_time)
+
+ pcc = check_with_pcc(torch_output_tensor, output_tensor, 0.999)
+ return [pcc, e2e_perf]
diff --git a/tests/sweep_framework/sweeps/eltwise/unary/eqz/eqz_sharded.py b/tests/sweep_framework/sweeps/eltwise/unary/eqz/eqz_sharded.py
new file mode 100644
index 00000000000..0c52d028425
--- /dev/null
+++ b/tests/sweep_framework/sweeps/eltwise/unary/eqz/eqz_sharded.py
@@ -0,0 +1,114 @@
+# SPDX-FileCopyrightText: © 2024 Tenstorrent Inc.
+
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Optional, Tuple
+from functools import partial
+
+import json
+import torch
+import random
+import ttnn
+import math
+from tests.sweep_framework.sweep_utils.utils import gen_shapes, sanitize_shape_rm
+from tests.sweep_framework.sweep_utils.sharding_utils import (
+ gen_sharded_spec_unary,
+ parse_sharding_spec,
+ invalidate_vector_sharding,
+)
+from tests.tt_eager.python_api_testing.sweep_tests.generation_funcs import gen_func_with_cast_tt
+
+from tests.ttnn.utils_for_testing import check_with_pcc, start_measuring_time, stop_measuring_time
+from models.utility_functions import torch_random_with_zeros
+
+# Override the default timeout in seconds for hang detection.
+TIMEOUT = 120
+
+random.seed(0)
+
+
+# Parameters provided to the test vector generator are defined here.
+# They are defined as dict-type suites that contain the arguments to the run function as keys, and lists of possible inputs as values.
+# Each suite has a key name (in this case "suite_1" and "suite_2") which will associate the test vectors to this specific suite of inputs.
+# Developers can create their own generator functions and pass them to the parameters as inputs.
+parameters = {
+ "nightly": {
+ "input_spec": gen_sharded_spec_unary(16, layouts=["TILE_LAYOUT"]),
+ "input_a_dtype": [ttnn.bfloat16],
+ },
+}
+
+
+# Invalidate vector is called during the generation phase where each vector will be passed in.
+# If invalidated, the vector will still be stored but will be skipped.
+# Returns False, None if the vector is valid, and True, str with a reason for invalidation if it is invalid.
+def invalidate_vector(test_vector) -> Tuple[bool, Optional[str]]:
+ input_layout = test_vector["input_spec"]["input_layout"]
+ sharding_invalidated, output_str = invalidate_vector_sharding(test_vector["input_spec"])
+
+ if input_layout == "ROW_MAJOR_LAYOUT":
+ return True, "Input to eltwise binary must be tilized"
+ if input_layout == "ROW_MAJOR_LAYOUT" and test_vector["input_a_dtype"] == ttnn.bfloat8_b:
+ return True, "bfloat8_b is only supported on tiled layout"
+ if sharding_invalidated:
+ return sharding_invalidated, output_str
+
+ return False, None
+
+
+# This is the run instructions for the test, defined by the developer.
+# The run function must take the above-defined parameters as inputs.
+# The runner will call this run function with each test vector, and the returned results from this function will be stored.
+# If you defined a mesh_device_fixture above, the object you yielded will be passed into this function as 'device'. Otherwise, it will be the default ttnn device opened by the infra.
+def run(
+ input_spec,
+ input_a_dtype,
+ *,
+ device,
+) -> list:
+ data_seed = random.randint(0, 20000000)
+ torch.manual_seed(data_seed)
+
+ (
+ input_shape,
+ core_grid,
+ sharding_strategy,
+ shard_orientation,
+ tensor_hw_as_shard_shape,
+ input_layout,
+ shard_height_mul_of_32,
+ ) = parse_sharding_spec(input_spec)
+
+ if input_layout == ttnn.ROW_MAJOR_LAYOUT:
+ input_shape = sanitize_shape_rm(input_shape)
+
+ sharded_config = ttnn.create_sharded_memory_config_(
+ shape=input_shape,
+ core_grid=core_grid,
+ strategy=sharding_strategy,
+ orientation=shard_orientation,
+ use_height_and_width_as_shard_shape=tensor_hw_as_shard_shape,
+ tile_layout=shard_height_mul_of_32,
+ )
+
+ torch_input_tensor_a = gen_func_with_cast_tt(
+ partial(torch_random_with_zeros, low=-100, high=100, dtype=torch.float32), input_a_dtype
+ )(input_shape)
+ golden_function = ttnn.get_golden_function(ttnn.eqz)
+ torch_output_tensor = golden_function(torch_input_tensor_a)
+
+ input_tensor_a = ttnn.from_torch(
+ torch_input_tensor_a,
+ dtype=input_a_dtype,
+ layout=input_layout,
+ device=device,
+ memory_config=sharded_config,
+ )
+
+ start_time = start_measuring_time()
+ output_tensor = ttnn.eqz(input_tensor_a, memory_config=sharded_config)
+ e2e_perf = stop_measuring_time(start_time)
+ output_tensor = ttnn.to_torch(output_tensor)
+
+ pcc = check_with_pcc(torch_output_tensor, output_tensor, 0.999)
+ return [pcc, e2e_perf]
diff --git a/tests/sweep_framework/sweeps/eltwise/unary/erf/erf_sharded.py b/tests/sweep_framework/sweeps/eltwise/unary/erf/erf_sharded.py
new file mode 100644
index 00000000000..e1fed6bc25a
--- /dev/null
+++ b/tests/sweep_framework/sweeps/eltwise/unary/erf/erf_sharded.py
@@ -0,0 +1,114 @@
+# SPDX-FileCopyrightText: © 2024 Tenstorrent Inc.
+
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Optional, Tuple
+from functools import partial
+
+import json
+import torch
+import random
+import ttnn
+import math
+from tests.sweep_framework.sweep_utils.utils import gen_shapes, sanitize_shape_rm
+from tests.sweep_framework.sweep_utils.sharding_utils import (
+ gen_sharded_spec_unary,
+ parse_sharding_spec,
+ invalidate_vector_sharding,
+)
+from tests.tt_eager.python_api_testing.sweep_tests.generation_funcs import gen_func_with_cast_tt
+
+from tests.ttnn.utils_for_testing import check_with_pcc, start_measuring_time, stop_measuring_time
+from models.utility_functions import torch_random
+
+# Override the default timeout in seconds for hang detection.
+TIMEOUT = 120
+
+random.seed(0)
+
+
+# Parameters provided to the test vector generator are defined here.
+# They are defined as dict-type suites that contain the arguments to the run function as keys, and lists of possible inputs as values.
+# Each suite has a key name (in this case "suite_1" and "suite_2") which will associate the test vectors to this specific suite of inputs.
+# Developers can create their own generator functions and pass them to the parameters as inputs.
+parameters = {
+ "nightly": {
+ "input_spec": gen_sharded_spec_unary(16, layouts=["TILE_LAYOUT"]),
+ "input_a_dtype": [ttnn.bfloat16, ttnn.bfloat8_b],
+ },
+}
+
+
+# Invalidate vector is called during the generation phase where each vector will be passed in.
+# If invalidated, the vector will still be stored but will be skipped.
+# Returns False, None if the vector is valid, and True, str with a reason for invalidation if it is invalid.
+def invalidate_vector(test_vector) -> Tuple[bool, Optional[str]]:
+ input_layout = test_vector["input_spec"]["input_layout"]
+ sharding_invalidated, output_str = invalidate_vector_sharding(test_vector["input_spec"])
+
+ if input_layout == "ROW_MAJOR_LAYOUT":
+ return True, "Input to eltwise binary must be tilized"
+ if input_layout == "ROW_MAJOR_LAYOUT" and test_vector["input_a_dtype"] == ttnn.bfloat8_b:
+ return True, "bfloat8_b is only supported on tiled layout"
+ if sharding_invalidated:
+ return sharding_invalidated, output_str
+
+ return False, None
+
+
+# This is the run instructions for the test, defined by the developer.
+# The run function must take the above-defined parameters as inputs.
+# The runner will call this run function with each test vector, and the returned results from this function will be stored.
+# If you defined a mesh_device_fixture above, the object you yielded will be passed into this function as 'device'. Otherwise, it will be the default ttnn device opened by the infra.
+def run(
+ input_spec,
+ input_a_dtype,
+ *,
+ device,
+) -> list:
+ data_seed = random.randint(0, 20000000)
+ torch.manual_seed(data_seed)
+
+ (
+ input_shape,
+ core_grid,
+ sharding_strategy,
+ shard_orientation,
+ tensor_hw_as_shard_shape,
+ input_layout,
+ shard_height_mul_of_32,
+ ) = parse_sharding_spec(input_spec)
+
+ if input_layout == ttnn.ROW_MAJOR_LAYOUT:
+ input_shape = sanitize_shape_rm(input_shape)
+
+ sharded_config = ttnn.create_sharded_memory_config_(
+ shape=input_shape,
+ core_grid=core_grid,
+ strategy=sharding_strategy,
+ orientation=shard_orientation,
+ use_height_and_width_as_shard_shape=tensor_hw_as_shard_shape,
+ tile_layout=shard_height_mul_of_32,
+ )
+
+ torch_input_tensor_a = gen_func_with_cast_tt(
+ partial(torch_random, low=-100, high=100, dtype=torch.float32), input_a_dtype
+ )(input_shape)
+ golden_function = ttnn.get_golden_function(ttnn.erf)
+ torch_output_tensor = golden_function(torch_input_tensor_a)
+
+ input_tensor_a = ttnn.from_torch(
+ torch_input_tensor_a,
+ dtype=input_a_dtype,
+ layout=input_layout,
+ device=device,
+ memory_config=sharded_config,
+ )
+
+ start_time = start_measuring_time()
+ output_tensor = ttnn.erf(input_tensor_a, memory_config=sharded_config)
+ e2e_perf = stop_measuring_time(start_time)
+ output_tensor = ttnn.to_torch(output_tensor)
+
+ pcc = check_with_pcc(torch_output_tensor, output_tensor, 0.999)
+ return [pcc, e2e_perf]
diff --git a/tests/sweep_framework/sweeps/eltwise/unary/leaky_relu/leaky_relu_sharded.py b/tests/sweep_framework/sweeps/eltwise/unary/leaky_relu/leaky_relu_sharded.py
new file mode 100644
index 00000000000..88880a65e92
--- /dev/null
+++ b/tests/sweep_framework/sweeps/eltwise/unary/leaky_relu/leaky_relu_sharded.py
@@ -0,0 +1,116 @@
+# SPDX-FileCopyrightText: © 2024 Tenstorrent Inc.
+
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Optional, Tuple
+from functools import partial
+
+import json
+import torch
+import random
+import ttnn
+import math
+from tests.sweep_framework.sweep_utils.utils import gen_shapes, sanitize_shape_rm
+from tests.sweep_framework.sweep_utils.sharding_utils import (
+ gen_sharded_spec_unary,
+ parse_sharding_spec,
+ invalidate_vector_sharding,
+)
+from tests.tt_eager.python_api_testing.sweep_tests.generation_funcs import gen_func_with_cast_tt
+
+from tests.ttnn.utils_for_testing import check_with_pcc, start_measuring_time, stop_measuring_time
+from models.utility_functions import torch_random
+
+# Override the default timeout in seconds for hang detection.
+TIMEOUT = 120
+
+random.seed(0)
+
+
+# Parameters provided to the test vector generator are defined here.
+# They are defined as dict-type suites that contain the arguments to the run function as keys, and lists of possible inputs as values.
+# Each suite has a key name (in this case "suite_1" and "suite_2") which will associate the test vectors to this specific suite of inputs.
+# Developers can create their own generator functions and pass them to the parameters as inputs.
+parameters = {
+ "nightly": {
+ "input_spec": gen_sharded_spec_unary(6, layouts=["TILE_LAYOUT"]),
+ "input_a_dtype": [ttnn.bfloat16, ttnn.bfloat8_b],
+ "negative_slope": [-0.5, 0, 0.01, 0.5],
+ },
+}
+
+
+# Invalidate vector is called during the generation phase where each vector will be passed in.
+# If invalidated, the vector will still be stored but will be skipped.
+# Returns False, None if the vector is valid, and True, str with a reason for invalidation if it is invalid.
+def invalidate_vector(test_vector) -> Tuple[bool, Optional[str]]:
+ input_layout = test_vector["input_spec"]["input_layout"]
+ sharding_invalidated, output_str = invalidate_vector_sharding(test_vector["input_spec"])
+
+ if input_layout == "ROW_MAJOR_LAYOUT":
+ return True, "Input to eltwise binary must be tilized"
+ if input_layout == "ROW_MAJOR_LAYOUT" and test_vector["input_a_dtype"] == ttnn.bfloat8_b:
+ return True, "bfloat8_b is only supported on tiled layout"
+ if sharding_invalidated:
+ return sharding_invalidated, output_str
+
+ return False, None
+
+
+# This is the run instructions for the test, defined by the developer.
+# The run function must take the above-defined parameters as inputs.
+# The runner will call this run function with each test vector, and the returned results from this function will be stored.
+# If you defined a mesh_device_fixture above, the object you yielded will be passed into this function as 'device'. Otherwise, it will be the default ttnn device opened by the infra.
+def run(
+ input_spec,
+ input_a_dtype,
+ negative_slope,
+ *,
+ device,
+) -> list:
+ data_seed = random.randint(0, 20000000)
+ torch.manual_seed(data_seed)
+
+ (
+ input_shape,
+ core_grid,
+ sharding_strategy,
+ shard_orientation,
+ tensor_hw_as_shard_shape,
+ input_layout,
+ shard_height_mul_of_32,
+ ) = parse_sharding_spec(input_spec)
+
+ if input_layout == ttnn.ROW_MAJOR_LAYOUT:
+ input_shape = sanitize_shape_rm(input_shape)
+
+ sharded_config = ttnn.create_sharded_memory_config_(
+ shape=input_shape,
+ core_grid=core_grid,
+ strategy=sharding_strategy,
+ orientation=shard_orientation,
+ use_height_and_width_as_shard_shape=tensor_hw_as_shard_shape,
+ tile_layout=shard_height_mul_of_32,
+ )
+
+ torch_input_tensor_a = gen_func_with_cast_tt(
+ partial(torch_random, low=-100, high=100, dtype=torch.float32), input_a_dtype
+ )(input_shape)
+ golden_function = ttnn.get_golden_function(ttnn.leaky_relu)
+ torch_output_tensor = golden_function(torch_input_tensor_a, negative_slope=negative_slope)
+
+ input_tensor_a = ttnn.from_torch(
+ torch_input_tensor_a,
+ dtype=input_a_dtype,
+ layout=input_layout,
+ device=device,
+ memory_config=sharded_config,
+ )
+
+ start_time = start_measuring_time()
+ output_tensor = ttnn.leaky_relu(input_tensor_a, negative_slope, memory_config=sharded_config)
+ e2e_perf = stop_measuring_time(start_time)
+ output_tensor = ttnn.to_torch(output_tensor)
+
+ pcc = check_with_pcc(torch_output_tensor, output_tensor, 0.999)
+ return [pcc, e2e_perf]
diff --git a/tests/sweep_framework/sweeps/eltwise/unary/lez/lez_sharded.py b/tests/sweep_framework/sweeps/eltwise/unary/lez/lez_sharded.py
new file mode 100644
index 00000000000..2c4ffbf69c4
--- /dev/null
+++ b/tests/sweep_framework/sweeps/eltwise/unary/lez/lez_sharded.py
@@ -0,0 +1,146 @@
+# SPDX-FileCopyrightText: © 2024 Tenstorrent Inc.
+
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Optional, Tuple
+from functools import partial
+
+import json
+import torch
+import random
+import ttnn
+import math
+from tests.sweep_framework.sweep_utils.utils import gen_shapes, sanitize_shape_rm
+from tests.sweep_framework.sweep_utils.sharding_utils import (
+ gen_sharded_spec_unary,
+ parse_sharding_spec,
+ invalidate_vector_sharding,
+)
+from tests.tt_eager.python_api_testing.sweep_tests.generation_funcs import gen_func_with_cast_tt
+
+from tests.ttnn.utils_for_testing import check_with_pcc, start_measuring_time, stop_measuring_time
+from models.utility_functions import torch_random
+
+# Override the default timeout in seconds for hang detection.
+TIMEOUT = 120
+
+random.seed(0)
+
+
+# Parameters provided to the test vector generator are defined here.
+# They are defined as dict-type suites that contain the arguments to the run function as keys, and lists of possible inputs as values.
+# Each suite has a key name (in this case "suite_1" and "suite_2") which will associate the test vectors to this specific suite of inputs.
+# Developers can create their own generator functions and pass them to the parameters as inputs.
+parameters = {
+ "nightly": {
+ "input_spec": gen_sharded_spec_unary(16, layouts=["TILE_LAYOUT"]),
+ "input_a_dtype": [ttnn.bfloat16, ttnn.bfloat8_b],
+ },
+}
+
+
+# Invalidate vector is called during the generation phase where each vector will be passed in.
+# If invalidated, the vector will still be stored but will be skipped.
+# Returns False, None if the vector is valid, and True, str with a reason for invalidation if it is invalid.
+def invalidate_vector(test_vector) -> Tuple[bool, Optional[str]]:
+ input_layout = test_vector["input_spec"]["input_layout"]
+ sharding_invalidated, output_str = invalidate_vector_sharding(test_vector["input_spec"])
+
+ if input_layout == "ROW_MAJOR_LAYOUT":
+ return True, "Input to eltwise binary must be tilized"
+ if input_layout == "ROW_MAJOR_LAYOUT" and test_vector["input_a_dtype"] == ttnn.bfloat8_b:
+ return True, "bfloat8_b is only supported on tiled layout"
+ if sharding_invalidated:
+ return sharding_invalidated, output_str
+
+ return False, None
+
+
+# These are the run instructions for the test, defined by the developer.
+# The run function must take the above-defined parameters as inputs.
+# The runner will call this run function with each test vector, and the returned results from this function will be stored.
+# If you defined a mesh_device_fixture above, the object you yielded will be passed into this function as 'device'. Otherwise, it will be the default ttnn device opened by the infra.
+def run(
+ input_spec,
+ input_a_dtype,
+ *,
+ device,
+) -> list:
+ data_seed = random.randint(0, 20000000)
+ torch.manual_seed(data_seed)
+
+ (
+ input_shape,
+ core_grid,
+ sharding_strategy,
+ shard_orientation,
+ tensor_hw_as_shard_shape,
+ input_layout,
+ shard_height_mul_of_32,
+ ) = parse_sharding_spec(input_spec)
+
+ if input_layout == ttnn.ROW_MAJOR_LAYOUT:
+ input_shape = sanitize_shape_rm(input_shape)
+
+ print(
+ f"{input_shape} {core_grid} {sharding_strategy} {shard_orientation} {tensor_hw_as_shard_shape} {input_a_dtype} {input_layout} {shard_height_mul_of_32}"
+ )
+
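+    # Build the sharded memory config described by the parsed spec; the same config is
+    # reused for the op output, so the result stays sharded on the same core grid.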
+ sharded_config = ttnn.create_sharded_memory_config_(
+ shape=input_shape,
+ core_grid=core_grid,
+ strategy=sharding_strategy,
+ orientation=shard_orientation,
+ use_height_and_width_as_shard_shape=tensor_hw_as_shard_shape,
+ tile_layout=shard_height_mul_of_32,
+ )
+
+ torch_input_tensor_a = gen_func_with_cast_tt(
+ partial(torch_random, low=-100, high=100, dtype=torch.float32), input_a_dtype
+ )(input_shape)
+ golden_function = ttnn.get_golden_function(ttnn.lez)
+ torch_output_tensor = golden_function(torch_input_tensor_a)
+
+ input_tensor_a = ttnn.from_torch(
+ torch_input_tensor_a,
+ dtype=input_a_dtype,
+ layout=input_layout,
+ device=device,
+ memory_config=sharded_config,
+ )
+
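+    # Only the device op is inside the measured window; readback to torch happens afterwards.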
+ start_time = start_measuring_time()
+ output_tensor = ttnn.lez(input_tensor_a, memory_config=sharded_config)
+ e2e_perf = stop_measuring_time(start_time)
+ output_tensor = ttnn.to_torch(output_tensor)
+
+ pcc = check_with_pcc(torch_output_tensor, output_tensor, 0.999)
+ print(pcc)
+ return [pcc, e2e_perf]
+
+
+# # Run sweeps locally
+# from tests.sweep_framework.framework.permutations import *
+
+# start_time = start_measuring_time()
+# for suite in parameters.keys():
+# device_id = 0
+# device = ttnn.open_device(device_id=device_id)
+# suite_vectors = list(permutations(parameters[suite]))
+# print(len(suite_vectors))
+# for vector in suite_vectors:
+# invalidate_res = invalidate_vector(vector)
+# if invalidate_res[0]:
+# print(f"Invalidated: {invalidate_res[1]}")
+# continue
+# try:
+# passed, _ = run(**vector, device=device)
+# # if passed[0] != True:
+# # print(passed)
+# except Exception as e:
+# print(e)
+
+# ttnn.close_device(device)
+
+# e2e_perf = stop_measuring_time(start_time)
+# print(f"time {e2e_perf / 1000000000}s")
diff --git a/tests/sweep_framework/sweeps/eltwise/unary/rdiv/rdiv_sharded.py b/tests/sweep_framework/sweeps/eltwise/unary/rdiv/rdiv_sharded.py
index acc77eba40a..75dcb176d1c 100644
--- a/tests/sweep_framework/sweeps/eltwise/unary/rdiv/rdiv_sharded.py
+++ b/tests/sweep_framework/sweeps/eltwise/unary/rdiv/rdiv_sharded.py
@@ -11,7 +11,11 @@
import ttnn
import math
from tests.sweep_framework.sweep_utils.utils import gen_shapes, sanitize_shape_rm
-from tests.sweep_framework.sweep_utils.sharding_utils import gen_sharded_spec_unary, parse_sharding_spec
+from tests.sweep_framework.sweep_utils.sharding_utils import (
+ gen_sharded_spec_unary,
+ parse_sharding_spec,
+ invalidate_vector_sharding,
+)
from tests.tt_eager.python_api_testing.sweep_tests.generation_funcs import gen_func_with_cast_tt
from tests.ttnn.utils_for_testing import check_with_pcc, start_measuring_time, stop_measuring_time
@@ -39,16 +43,13 @@
# If invalidated, the vector will still be stored but will be skipped.
# Returns False, None if the vector is valid, and True, str with a reason for invalidation if it is invalid.
def invalidate_vector(test_vector) -> Tuple[bool, Optional[str]]:
- input_shape, X, Y, sharding_strategy, _, _, input_layout = test_vector["input_spec"].values()
- pre_sharded_height = math.prod(input_shape[:-1])
- pre_sharded_width = input_shape[-1]
+ input_layout = test_vector["input_spec"]["input_layout"]
+ sharding_invalidated, output_str = invalidate_vector_sharding(test_vector["input_spec"])
if input_layout == "ROW_MAJOR_LAYOUT":
- return True, "Input to eltwise binary must be tilized"
-
- if input_layout == "ROW_MAJOR_LAYOUT" and test_vector["input_a_dtype"] == ttnn.bfloat8_b:
- return True, "bfloat8_b is only supported on tiled layout"
-
+ return True, "Inputs to eltwise binary must be tilized"
+ if sharding_invalidated:
+ return sharding_invalidated, output_str
return False, None
@@ -72,6 +73,7 @@ def run(
shard_orientation,
tensor_hw_as_shard_shape,
input_layout,
+ shard_height_mul_of_32,
) = parse_sharding_spec(input_spec)
if input_layout == ttnn.ROW_MAJOR_LAYOUT:
@@ -91,6 +93,7 @@ def run(
strategy=sharding_strategy,
orientation=shard_orientation,
use_height_and_width_as_shard_shape=tensor_hw_as_shard_shape,
+ tile_layout=shard_height_mul_of_32,
)
input_tensor_a = ttnn.from_torch(
diff --git a/tests/sweep_framework/sweeps/eltwise/unary/reciprocal/reciprocal.py b/tests/sweep_framework/sweeps/eltwise/unary/reciprocal/reciprocal.py
new file mode 100644
index 00000000000..16404f972a1
--- /dev/null
+++ b/tests/sweep_framework/sweeps/eltwise/unary/reciprocal/reciprocal.py
@@ -0,0 +1,69 @@
+# SPDX-FileCopyrightText: © 2024 Tenstorrent Inc.
+
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Optional, Tuple
+from functools import partial
+
+import torch
+import ttnn
+from tests.sweep_framework.sweep_utils.utils import gen_shapes
+from tests.tt_eager.python_api_testing.sweep_tests.generation_funcs import gen_func_with_cast_tt
+
+from tests.ttnn.utils_for_testing import check_with_pcc, start_measuring_time, stop_measuring_time
+from models.utility_functions import torch_random
+
+
+# Parameters provided to the test vector generator are defined here.
+# They are defined as dict-type suites that contain the arguments to the run function as keys, and lists of possible inputs as values.
+# Each suite has a key name (in this case "nightly") which will associate the test vectors to this specific suite of inputs.
+# Developers can create their own generator functions and pass them to the parameters as inputs.
+parameters = {
+ "nightly": {
+ "input_shape": gen_shapes([1, 1, 1, 1], [6, 12, 256, 256], [1, 1, 1, 1], 32)
+ + gen_shapes([1, 1, 1], [12, 256, 256], [1, 1, 1], 32)
+ + gen_shapes([1, 1], [256, 256], [1, 1], 32),
+ "input_dtype": [ttnn.bfloat16],
+ "input_layout": [ttnn.TILE_LAYOUT],
+ "input_memory_config": [ttnn.DRAM_MEMORY_CONFIG, ttnn.L1_MEMORY_CONFIG],
+ "output_memory_config": [ttnn.DRAM_MEMORY_CONFIG, ttnn.L1_MEMORY_CONFIG],
+ },
+}
+
+
+# These are the run instructions for the test, defined by the developer.
+# The run function must take the above-defined parameters as inputs.
+# The runner will call this run function with each test vector, and the returned results from this function will be stored.
+# If you defined a device_mesh_fixture above, the object you yielded will be passed into this function as 'device'. Otherwise, it will be the default ttnn device opened by the infra.
+def run(
+ input_shape,
+ input_dtype,
+ input_layout,
+ input_memory_config,
+ output_memory_config,
+ *,
+ device,
+) -> list:
+ torch.manual_seed(0)
+
+ torch_input_tensor = gen_func_with_cast_tt(
+ partial(torch_random, low=-100, high=100, dtype=torch.float32), input_dtype
+ )(input_shape)
+ golden_function = ttnn.get_golden_function(ttnn.reciprocal)
+ torch_output_tensor = golden_function(torch_input_tensor)
+
+ input_tensor = ttnn.from_torch(
+ torch_input_tensor,
+ dtype=input_dtype,
+ layout=input_layout,
+ device=device,
+ memory_config=input_memory_config,
+ )
+
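+    # Note: the readback (ttnn.to_torch) is inside the measured window here, so e2e_perf
+    # covers device execution plus host transfer.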
+ start_time = start_measuring_time()
+ result = ttnn.reciprocal(input_tensor, memory_config=output_memory_config)
+ output_tensor = ttnn.to_torch(result)
+ e2e_perf = stop_measuring_time(start_time)
+
+ pcc = check_with_pcc(torch_output_tensor, output_tensor, 0.999)
+ return [pcc, e2e_perf]
diff --git a/tests/sweep_framework/sweeps/eltwise/unary/reciprocal/reciprocal_sharded.py b/tests/sweep_framework/sweeps/eltwise/unary/reciprocal/reciprocal_sharded.py
new file mode 100644
index 00000000000..137ff5755f9
--- /dev/null
+++ b/tests/sweep_framework/sweeps/eltwise/unary/reciprocal/reciprocal_sharded.py
@@ -0,0 +1,114 @@
+# SPDX-FileCopyrightText: © 2024 Tenstorrent Inc.
+
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Optional, Tuple
+from functools import partial
+
+import json
+import torch
+import random
+import ttnn
+import math
+from tests.sweep_framework.sweep_utils.utils import gen_shapes, sanitize_shape_rm
+from tests.sweep_framework.sweep_utils.sharding_utils import (
+ gen_sharded_spec_unary,
+ parse_sharding_spec,
+ invalidate_vector_sharding,
+)
+from tests.tt_eager.python_api_testing.sweep_tests.generation_funcs import gen_func_with_cast_tt
+
+from tests.ttnn.utils_for_testing import check_with_pcc, start_measuring_time, stop_measuring_time
+from models.utility_functions import torch_random
+
+# Override the default timeout in seconds for hang detection.
+TIMEOUT = 120
+
+random.seed(0)
+
+
+# Parameters provided to the test vector generator are defined here.
+# They are defined as dict-type suites that contain the arguments to the run function as keys, and lists of possible inputs as values.
+# Each suite has a key name (in this case "nightly") which will associate the test vectors to this specific suite of inputs.
+# Developers can create their own generator functions and pass them to the parameters as inputs.
+parameters = {
+ "nightly": {
+ "input_spec": gen_sharded_spec_unary(16, layouts=["TILE_LAYOUT"]),
+ "input_a_dtype": [ttnn.bfloat16],
+ },
+}
+
+
+# Invalidate vector is called during the generation phase where each vector will be passed in.
+# If invalidated, the vector will still be stored but will be skipped.
+# Returns False, None if the vector is valid, and True, str with a reason for invalidation if it is invalid.
+def invalidate_vector(test_vector) -> Tuple[bool, Optional[str]]:
+ input_layout = test_vector["input_spec"]["input_layout"]
+ sharding_invalidated, output_str = invalidate_vector_sharding(test_vector["input_spec"])
+
+ if input_layout == "ROW_MAJOR_LAYOUT":
+ return True, "Input to eltwise binary must be tilized"
+ if input_layout == "ROW_MAJOR_LAYOUT" and test_vector["input_a_dtype"] == ttnn.bfloat8_b:
+ return True, "bfloat8_b is only supported on tiled layout"
+ if sharding_invalidated:
+ return sharding_invalidated, output_str
+
+ return False, None
+
+
+# These are the run instructions for the test, defined by the developer.
+# The run function must take the above-defined parameters as inputs.
+# The runner will call this run function with each test vector, and the returned results from this function will be stored.
+# If you defined a mesh_device_fixture above, the object you yielded will be passed into this function as 'device'. Otherwise, it will be the default ttnn device opened by the infra.
+def run(
+ input_spec,
+ input_a_dtype,
+ *,
+ device,
+) -> list:
+ data_seed = random.randint(0, 20000000)
+ torch.manual_seed(data_seed)
+
+ (
+ input_shape,
+ core_grid,
+ sharding_strategy,
+ shard_orientation,
+ tensor_hw_as_shard_shape,
+ input_layout,
+ shard_height_mul_of_32,
+ ) = parse_sharding_spec(input_spec)
+
+ if input_layout == ttnn.ROW_MAJOR_LAYOUT:
+ input_shape = sanitize_shape_rm(input_shape)
+
+ sharded_config = ttnn.create_sharded_memory_config_(
+ shape=input_shape,
+ core_grid=core_grid,
+ strategy=sharding_strategy,
+ orientation=shard_orientation,
+ use_height_and_width_as_shard_shape=tensor_hw_as_shard_shape,
+ tile_layout=shard_height_mul_of_32,
+ )
+
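+    # Generate a random input and compute the reference output via the golden (torch)
+    # implementation registered for ttnn.reciprocal.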
+ torch_input_tensor_a = gen_func_with_cast_tt(
+ partial(torch_random, low=-100, high=100, dtype=torch.float32), input_a_dtype
+ )(input_shape)
+ golden_function = ttnn.get_golden_function(ttnn.reciprocal)
+ torch_output_tensor = golden_function(torch_input_tensor_a)
+
+ input_tensor_a = ttnn.from_torch(
+ torch_input_tensor_a,
+ dtype=input_a_dtype,
+ layout=input_layout,
+ device=device,
+ memory_config=sharded_config,
+ )
+
+ start_time = start_measuring_time()
+ output_tensor = ttnn.reciprocal(input_tensor_a, memory_config=sharded_config)
+ e2e_perf = stop_measuring_time(start_time)
+ output_tensor = ttnn.to_torch(output_tensor)
+
+ pcc = check_with_pcc(torch_output_tensor, output_tensor, 0.999)
+ return [pcc, e2e_perf]
diff --git a/tests/sweep_framework/sweeps/eltwise/unary/rsub/rsub_sharded.py b/tests/sweep_framework/sweeps/eltwise/unary/rsub/rsub_sharded.py
index e6ff531b973..f7aec1d0ee4 100644
--- a/tests/sweep_framework/sweeps/eltwise/unary/rsub/rsub_sharded.py
+++ b/tests/sweep_framework/sweeps/eltwise/unary/rsub/rsub_sharded.py
@@ -11,7 +11,11 @@
import ttnn
import math
from tests.sweep_framework.sweep_utils.utils import gen_shapes, sanitize_shape_rm
-from tests.sweep_framework.sweep_utils.sharding_utils import gen_sharded_spec_unary, parse_sharding_spec
+from tests.sweep_framework.sweep_utils.sharding_utils import (
+ gen_sharded_spec_unary,
+ parse_sharding_spec,
+ invalidate_vector_sharding,
+)
from tests.tt_eager.python_api_testing.sweep_tests.generation_funcs import gen_func_with_cast_tt
from tests.ttnn.utils_for_testing import check_with_pcc, start_measuring_time, stop_measuring_time
@@ -39,16 +43,13 @@
# If invalidated, the vector will still be stored but will be skipped.
# Returns False, None if the vector is valid, and True, str with a reason for invalidation if it is invalid.
def invalidate_vector(test_vector) -> Tuple[bool, Optional[str]]:
- input_shape, X, Y, sharding_strategy, _, _, input_layout = test_vector["input_spec"].values()
- pre_sharded_height = math.prod(input_shape[:-1])
- pre_sharded_width = input_shape[-1]
+ input_layout = test_vector["input_spec"]["input_layout"]
+ sharding_invalidated, output_str = invalidate_vector_sharding(test_vector["input_spec"])
if input_layout == "ROW_MAJOR_LAYOUT":
- return True, "Input to eltwise binary must be tilized"
-
- if input_layout == "ROW_MAJOR_LAYOUT" and test_vector["input_a_dtype"] == ttnn.bfloat8_b:
- return True, "bfloat8_b is only supported on tiled layout"
-
+ return True, "Inputs to eltwise binary must be tilized"
+ if sharding_invalidated:
+ return sharding_invalidated, output_str
return False, None
@@ -72,6 +73,7 @@ def run(
shard_orientation,
tensor_hw_as_shard_shape,
input_layout,
+ shard_height_mul_of_32,
) = parse_sharding_spec(input_spec)
if input_layout == ttnn.ROW_MAJOR_LAYOUT:
@@ -91,6 +93,7 @@ def run(
strategy=sharding_strategy,
orientation=shard_orientation,
use_height_and_width_as_shard_shape=tensor_hw_as_shard_shape,
+ tile_layout=shard_height_mul_of_32,
)
input_tensor_a = ttnn.from_torch(
diff --git a/tests/sweep_framework/sweeps/eltwise/unary/sin/sin_sharded.py b/tests/sweep_framework/sweeps/eltwise/unary/sin/sin_sharded.py
new file mode 100644
index 00000000000..aa8402d106e
--- /dev/null
+++ b/tests/sweep_framework/sweeps/eltwise/unary/sin/sin_sharded.py
@@ -0,0 +1,114 @@
+# SPDX-FileCopyrightText: © 2024 Tenstorrent Inc.
+
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Optional, Tuple
+from functools import partial
+
+import json
+import torch
+import random
+import ttnn
+import math
+from tests.sweep_framework.sweep_utils.utils import gen_shapes, sanitize_shape_rm
+from tests.sweep_framework.sweep_utils.sharding_utils import (
+ gen_sharded_spec_unary,
+ parse_sharding_spec,
+ invalidate_vector_sharding,
+)
+from tests.tt_eager.python_api_testing.sweep_tests.generation_funcs import gen_func_with_cast_tt
+
+from tests.ttnn.utils_for_testing import check_with_pcc, start_measuring_time, stop_measuring_time
+from models.utility_functions import torch_random
+
+# Override the default timeout in seconds for hang detection.
+TIMEOUT = 120
+
+random.seed(0)
+
+
+# Parameters provided to the test vector generator are defined here.
+# They are defined as dict-type suites that contain the arguments to the run function as keys, and lists of possible inputs as values.
+# Each suite has a key name (in this case "nightly") which will associate the test vectors to this specific suite of inputs.
+# Developers can create their own generator functions and pass them to the parameters as inputs.
+parameters = {
+ "nightly": {
+ "input_spec": gen_sharded_spec_unary(16, layouts=["TILE_LAYOUT"]),
+ "input_a_dtype": [ttnn.bfloat16, ttnn.bfloat8_b],
+ },
+}
+
+
+# Invalidate vector is called during the generation phase where each vector will be passed in.
+# If invalidated, the vector will still be stored but will be skipped.
+# Returns False, None if the vector is valid, and True, str with a reason for invalidation if it is invalid.
+def invalidate_vector(test_vector) -> Tuple[bool, Optional[str]]:
+ input_layout = test_vector["input_spec"]["input_layout"]
+ sharding_invalidated, output_str = invalidate_vector_sharding(test_vector["input_spec"])
+
+ if input_layout == "ROW_MAJOR_LAYOUT":
+ return True, "Input to eltwise binary must be tilized"
+ if input_layout == "ROW_MAJOR_LAYOUT" and test_vector["input_a_dtype"] == ttnn.bfloat8_b:
+ return True, "bfloat8_b is only supported on tiled layout"
+ if sharding_invalidated:
+ return sharding_invalidated, output_str
+
+ return False, None
+
+
+# These are the run instructions for the test, defined by the developer.
+# The run function must take the above-defined parameters as inputs.
+# The runner will call this run function with each test vector, and the returned results from this function will be stored.
+# If you defined a mesh_device_fixture above, the object you yielded will be passed into this function as 'device'. Otherwise, it will be the default ttnn device opened by the infra.
+def run(
+ input_spec,
+ input_a_dtype,
+ *,
+ device,
+) -> list:
+ data_seed = random.randint(0, 20000000)
+ torch.manual_seed(data_seed)
+
+ (
+ input_shape,
+ core_grid,
+ sharding_strategy,
+ shard_orientation,
+ tensor_hw_as_shard_shape,
+ input_layout,
+ shard_height_mul_of_32,
+ ) = parse_sharding_spec(input_spec)
+
+ if input_layout == ttnn.ROW_MAJOR_LAYOUT:
+ input_shape = sanitize_shape_rm(input_shape)
+
+ sharded_config = ttnn.create_sharded_memory_config_(
+ shape=input_shape,
+ core_grid=core_grid,
+ strategy=sharding_strategy,
+ orientation=shard_orientation,
+ use_height_and_width_as_shard_shape=tensor_hw_as_shard_shape,
+ tile_layout=shard_height_mul_of_32,
+ )
+
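+    # Inputs span [-100, 100], covering many periods of sine; accuracy is judged with a 0.999 PCC threshold.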
+ torch_input_tensor_a = gen_func_with_cast_tt(
+ partial(torch_random, low=-100, high=100, dtype=torch.float32), input_a_dtype
+ )(input_shape)
+ golden_function = ttnn.get_golden_function(ttnn.sin)
+ torch_output_tensor = golden_function(torch_input_tensor_a)
+
+ input_tensor_a = ttnn.from_torch(
+ torch_input_tensor_a,
+ dtype=input_a_dtype,
+ layout=input_layout,
+ device=device,
+ memory_config=sharded_config,
+ )
+
+ start_time = start_measuring_time()
+ output_tensor = ttnn.sin(input_tensor_a, memory_config=sharded_config)
+ e2e_perf = stop_measuring_time(start_time)
+ output_tensor = ttnn.to_torch(output_tensor)
+
+ pcc = check_with_pcc(torch_output_tensor, output_tensor, 0.999)
+ return [pcc, e2e_perf]
diff --git a/tests/sweep_framework/sweeps/eltwise/unary_backward/acos_bw/acos_bw_sharded.py b/tests/sweep_framework/sweeps/eltwise/unary_backward/acos_bw/acos_bw_sharded.py
new file mode 100644
index 00000000000..2ecdfcbe108
--- /dev/null
+++ b/tests/sweep_framework/sweeps/eltwise/unary_backward/acos_bw/acos_bw_sharded.py
@@ -0,0 +1,128 @@
+# SPDX-FileCopyrightText: © 2024 Tenstorrent Inc.
+
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Optional, Tuple
+from functools import partial
+
+import json
+import torch
+import random
+import ttnn
+import math
+from tests.sweep_framework.sweep_utils.utils import gen_shapes, sanitize_shape_rm
+from tests.sweep_framework.sweep_utils.sharding_utils import (
+ gen_sharded_spec_unary,
+ parse_sharding_spec,
+ invalidate_vector_sharding,
+)
+from tests.tt_eager.python_api_testing.sweep_tests.generation_funcs import gen_func_with_cast_tt
+
+from tests.ttnn.utils_for_testing import check_with_pcc, start_measuring_time, stop_measuring_time
+from models.utility_functions import torch_random
+
+# Override the default timeout in seconds for hang detection.
+TIMEOUT = 120
+
+random.seed(0)
+
+
+# Parameters provided to the test vector generator are defined here.
+# They are defined as dict-type suites that contain the arguments to the run function as keys, and lists of possible inputs as values.
+# Each suite has a key name (in this case "nightly") which will associate the test vectors to this specific suite of inputs.
+# Developers can create their own generator functions and pass them to the parameters as inputs.
+parameters = {
+ "nightly": {
+ "input_spec": gen_sharded_spec_unary(16, layouts=["TILE_LAYOUT"]),
+ "input_a_dtype": [ttnn.bfloat16],
+ },
+}
+
+
+# Invalidate vector is called during the generation phase where each vector will be passed in.
+# If invalidated, the vector will still be stored but will be skipped.
+# Returns False, None if the vector is valid, and True, str with a reason for invalidation if it is invalid.
+def invalidate_vector(test_vector) -> Tuple[bool, Optional[str]]:
+ input_layout = test_vector["input_spec"]["input_layout"]
+ sharding_invalidated, output_str = invalidate_vector_sharding(test_vector["input_spec"])
+
+ if input_layout == "ROW_MAJOR_LAYOUT":
+ return True, "Input to eltwise binary must be tilized"
+ if input_layout == "ROW_MAJOR_LAYOUT" and test_vector["input_a_dtype"] == ttnn.bfloat8_b:
+ return True, "bfloat8_b is only supported on tiled layout"
+ if sharding_invalidated:
+ return sharding_invalidated, output_str
+
+ return False, None
+
+
+# These are the run instructions for the test, defined by the developer.
+# The run function must take the above-defined parameters as inputs.
+# The runner will call this run function with each test vector, and the returned results from this function will be stored.
+# If you defined a mesh_device_fixture above, the object you yielded will be passed into this function as 'device'. Otherwise, it will be the default ttnn device opened by the infra.
+def run(
+ input_spec,
+ input_a_dtype,
+ *,
+ device,
+) -> list:
+ data_seed = random.randint(0, 20000000)
+ torch.manual_seed(data_seed)
+
+ (
+ input_shape,
+ core_grid,
+ sharding_strategy,
+ shard_orientation,
+ tensor_hw_as_shard_shape,
+ input_layout,
+ shard_height_mul_of_32,
+ ) = parse_sharding_spec(input_spec)
+
+ if input_layout == ttnn.ROW_MAJOR_LAYOUT:
+ input_shape = sanitize_shape_rm(input_shape)
+
+ sharded_config = ttnn.create_sharded_memory_config_(
+ shape=input_shape,
+ core_grid=core_grid,
+ strategy=sharding_strategy,
+ orientation=shard_orientation,
+ use_height_and_width_as_shard_shape=tensor_hw_as_shard_shape,
+ tile_layout=shard_height_mul_of_32,
+ )
+
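+    # Backward sweeps build two tensors: an incoming gradient and a forward input
+    # (with requires_grad set) so the golden function can produce the reference input gradient.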
+ torch_grad_tensor = gen_func_with_cast_tt(
+ partial(torch_random, low=-100, high=100, dtype=torch.float32), input_a_dtype
+ )(input_shape)
+
+ torch_input_tensor = gen_func_with_cast_tt(
+ partial(torch_random, low=-100, high=100, dtype=torch.float32), input_a_dtype
+ )(input_shape)
+
+ torch_input_tensor.requires_grad = True
+ golden_function = ttnn.get_golden_function(ttnn.acos_bw)
+ torch_output_tensor = golden_function(torch_grad_tensor, torch_input_tensor)[0]
+
+ grad_tensor = ttnn.from_torch(
+ torch_grad_tensor,
+ dtype=input_a_dtype,
+ layout=input_layout,
+ device=device,
+ memory_config=sharded_config,
+ )
+
+ input_tensor = ttnn.from_torch(
+ torch_input_tensor,
+ dtype=input_a_dtype,
+ layout=input_layout,
+ device=device,
+ memory_config=sharded_config,
+ )
+
+ start_time = start_measuring_time()
+ output_tensor = ttnn.acos_bw(grad_tensor, input_tensor, memory_config=sharded_config)[0]
+ e2e_perf = stop_measuring_time(start_time)
+ output_tensor = ttnn.to_torch(output_tensor)
+
+ pcc = check_with_pcc(torch_output_tensor, output_tensor, 0.999)
+ return [pcc, e2e_perf]
diff --git a/tests/sweep_framework/sweeps/eltwise/unary_backward/add_bw/add_bw_sharded.py b/tests/sweep_framework/sweeps/eltwise/unary_backward/add_bw/add_bw_sharded.py
new file mode 100644
index 00000000000..fddb3d4360a
--- /dev/null
+++ b/tests/sweep_framework/sweeps/eltwise/unary_backward/add_bw/add_bw_sharded.py
@@ -0,0 +1,130 @@
+# SPDX-FileCopyrightText: © 2024 Tenstorrent Inc.
+
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Optional, Tuple
+from functools import partial
+
+import json
+import torch
+import random
+import ttnn
+import math
+from tests.sweep_framework.sweep_utils.utils import gen_shapes, sanitize_shape_rm
+from tests.sweep_framework.sweep_utils.sharding_utils import (
+ gen_sharded_spec_unary,
+ parse_sharding_spec,
+ invalidate_vector_sharding,
+)
+from tests.tt_eager.python_api_testing.sweep_tests.generation_funcs import gen_func_with_cast_tt
+
+from tests.ttnn.utils_for_testing import check_with_pcc, start_measuring_time, stop_measuring_time
+from models.utility_functions import torch_random
+
+# Override the default timeout in seconds for hang detection.
+TIMEOUT = 120
+
+random.seed(0)
+
+
+# Parameters provided to the test vector generator are defined here.
+# They are defined as dict-type suites that contain the arguments to the run function as keys, and lists of possible inputs as values.
+# Each suite has a key name (in this case "nightly") which will associate the test vectors to this specific suite of inputs.
+# Developers can create their own generator functions and pass them to the parameters as inputs.
+parameters = {
+ "nightly": {
+ "input_spec": gen_sharded_spec_unary(16, layouts=["TILE_LAYOUT"]),
+ "input_a_dtype": [ttnn.bfloat16],
+ },
+}
+
+
+# Invalidate vector is called during the generation phase where each vector will be passed in.
+# If invalidated, the vector will still be stored but will be skipped.
+# Returns False, None if the vector is valid, and True, str with a reason for invalidation if it is invalid.
+def invalidate_vector(test_vector) -> Tuple[bool, Optional[str]]:
+ input_layout = test_vector["input_spec"]["input_layout"]
+ sharding_invalidated, output_str = invalidate_vector_sharding(test_vector["input_spec"])
+
+ if input_layout == "ROW_MAJOR_LAYOUT":
+ return True, "Input to eltwise binary must be tilized"
+ if input_layout == "ROW_MAJOR_LAYOUT" and test_vector["input_a_dtype"] == ttnn.bfloat8_b:
+ return True, "bfloat8_b is only supported on tiled layout"
+ if sharding_invalidated:
+ return sharding_invalidated, output_str
+
+ return False, None
+
+
+# These are the run instructions for the test, defined by the developer.
+# The run function must take the above-defined parameters as inputs.
+# The runner will call this run function with each test vector, and the returned results from this function will be stored.
+# If you defined a mesh_device_fixture above, the object you yielded will be passed into this function as 'device'. Otherwise, it will be the default ttnn device opened by the infra.
+def run(
+ input_spec,
+ input_a_dtype,
+ *,
+ device,
+) -> list:
+ data_seed = random.randint(0, 20000000)
+ torch.manual_seed(data_seed)
+
+ (
+ input_shape,
+ core_grid,
+ sharding_strategy,
+ shard_orientation,
+ tensor_hw_as_shard_shape,
+ input_layout,
+ shard_height_mul_of_32,
+ ) = parse_sharding_spec(input_spec)
+
+ if input_layout == ttnn.ROW_MAJOR_LAYOUT:
+ input_shape = sanitize_shape_rm(input_shape)
+
+ sharded_config = ttnn.create_sharded_memory_config_(
+ shape=input_shape,
+ core_grid=core_grid,
+ strategy=sharding_strategy,
+ orientation=shard_orientation,
+ use_height_and_width_as_shard_shape=tensor_hw_as_shard_shape,
+ tile_layout=shard_height_mul_of_32,
+ )
+
+ torch_grad_tensor = gen_func_with_cast_tt(
+ partial(torch_random, low=-100, high=100, dtype=torch.float32), input_a_dtype
+ )(input_shape)
+
+ torch_input_tensor = gen_func_with_cast_tt(
+ partial(torch_random, low=-100, high=100, dtype=torch.float32), input_a_dtype
+ )(input_shape)
+
+ torch_input_tensor.requires_grad = True
+
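+    # Random scalar addend drawn in bfloat16; add_bw computes the gradient w.r.t. the tensor input.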
+ scalar = torch.tensor(1, dtype=torch.bfloat16).uniform_(-100, 100).item()
+ golden_function = ttnn.get_golden_function(ttnn.add_bw)
+ torch_output_tensor = golden_function(torch_grad_tensor, torch_input_tensor, scalar)[0]
+
+ grad_tensor = ttnn.from_torch(
+ torch_grad_tensor,
+ dtype=input_a_dtype,
+ layout=input_layout,
+ device=device,
+ memory_config=sharded_config,
+ )
+
+ input_tensor = ttnn.from_torch(
+ torch_input_tensor,
+ dtype=input_a_dtype,
+ layout=input_layout,
+ device=device,
+ memory_config=sharded_config,
+ )
+
+ start_time = start_measuring_time()
+ output_tensor = ttnn.add_bw(grad_tensor, input_tensor, scalar, memory_config=sharded_config)[0]
+ e2e_perf = stop_measuring_time(start_time)
+ output_tensor = ttnn.to_torch(output_tensor)
+
+ pcc = check_with_pcc(torch_output_tensor, output_tensor, 0.999)
+ return [pcc, e2e_perf]
diff --git a/tests/sweep_framework/sweeps/eltwise/unary_backward/cos_bw/cos_bw_sharded.py b/tests/sweep_framework/sweeps/eltwise/unary_backward/cos_bw/cos_bw_sharded.py
new file mode 100644
index 00000000000..f45031b69c0
--- /dev/null
+++ b/tests/sweep_framework/sweeps/eltwise/unary_backward/cos_bw/cos_bw_sharded.py
@@ -0,0 +1,128 @@
+# SPDX-FileCopyrightText: © 2024 Tenstorrent Inc.
+
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Optional, Tuple
+from functools import partial
+
+import json
+import torch
+import random
+import ttnn
+import math
+from tests.sweep_framework.sweep_utils.utils import gen_shapes, sanitize_shape_rm
+from tests.sweep_framework.sweep_utils.sharding_utils import (
+ gen_sharded_spec_unary,
+ parse_sharding_spec,
+ invalidate_vector_sharding,
+)
+from tests.tt_eager.python_api_testing.sweep_tests.generation_funcs import gen_func_with_cast_tt
+
+from tests.ttnn.utils_for_testing import check_with_pcc, start_measuring_time, stop_measuring_time
+from models.utility_functions import torch_random
+
+# Override the default timeout in seconds for hang detection.
+TIMEOUT = 120
+
+random.seed(0)
+
+
+# Parameters provided to the test vector generator are defined here.
+# They are defined as dict-type suites that contain the arguments to the run function as keys, and lists of possible inputs as values.
+# Each suite has a key name (in this case "nightly") which will associate the test vectors to this specific suite of inputs.
+# Developers can create their own generator functions and pass them to the parameters as inputs.
+parameters = {
+ "nightly": {
+ "input_spec": gen_sharded_spec_unary(16, layouts=["TILE_LAYOUT"]),
+ "input_a_dtype": [ttnn.bfloat16],
+ },
+}
+
+
+# Invalidate vector is called during the generation phase where each vector will be passed in.
+# If invalidated, the vector will still be stored but will be skipped.
+# Returns False, None if the vector is valid, and True, str with a reason for invalidation if it is invalid.
+def invalidate_vector(test_vector) -> Tuple[bool, Optional[str]]:
+ input_layout = test_vector["input_spec"]["input_layout"]
+ sharding_invalidated, output_str = invalidate_vector_sharding(test_vector["input_spec"])
+
+ if input_layout == "ROW_MAJOR_LAYOUT":
+ return True, "Input to eltwise binary must be tilized"
+ if input_layout == "ROW_MAJOR_LAYOUT" and test_vector["input_a_dtype"] == ttnn.bfloat8_b:
+ return True, "bfloat8_b is only supported on tiled layout"
+ if sharding_invalidated:
+ return sharding_invalidated, output_str
+
+ return False, None
+
+
+# These are the run instructions for the test, defined by the developer.
+# The run function must take the above-defined parameters as inputs.
+# The runner will call this run function with each test vector, and the returned results from this function will be stored.
+# If you defined a mesh_device_fixture above, the object you yielded will be passed into this function as 'device'. Otherwise, it will be the default ttnn device opened by the infra.
+def run(
+ input_spec,
+ input_a_dtype,
+ *,
+ device,
+) -> list:
+ data_seed = random.randint(0, 20000000)
+ torch.manual_seed(data_seed)
+
+ (
+ input_shape,
+ core_grid,
+ sharding_strategy,
+ shard_orientation,
+ tensor_hw_as_shard_shape,
+ input_layout,
+ shard_height_mul_of_32,
+ ) = parse_sharding_spec(input_spec)
+
+ if input_layout == ttnn.ROW_MAJOR_LAYOUT:
+ input_shape = sanitize_shape_rm(input_shape)
+
+ sharded_config = ttnn.create_sharded_memory_config_(
+ shape=input_shape,
+ core_grid=core_grid,
+ strategy=sharding_strategy,
+ orientation=shard_orientation,
+ use_height_and_width_as_shard_shape=tensor_hw_as_shard_shape,
+ tile_layout=shard_height_mul_of_32,
+ )
+
+ torch_grad_tensor = gen_func_with_cast_tt(
+ partial(torch_random, low=-100, high=100, dtype=torch.float32), input_a_dtype
+ )(input_shape)
+
+ torch_input_tensor = gen_func_with_cast_tt(
+ partial(torch_random, low=-100, high=100, dtype=torch.float32), input_a_dtype
+ )(input_shape)
+
+ torch_input_tensor.requires_grad = True
+ golden_function = ttnn.get_golden_function(ttnn.cos_bw)
+ torch_output_tensor = golden_function(torch_grad_tensor, torch_input_tensor)[0]
+
+ grad_tensor = ttnn.from_torch(
+ torch_grad_tensor,
+ dtype=input_a_dtype,
+ layout=input_layout,
+ device=device,
+ memory_config=sharded_config,
+ )
+
+ input_tensor = ttnn.from_torch(
+ torch_input_tensor,
+ dtype=input_a_dtype,
+ layout=input_layout,
+ device=device,
+ memory_config=sharded_config,
+ )
+
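+    # ttnn backward ops return a list of gradient tensors; [0] selects the gradient w.r.t. the input.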
+ start_time = start_measuring_time()
+ output_tensor = ttnn.cos_bw(grad_tensor, input_tensor, memory_config=sharded_config)[0]
+ e2e_perf = stop_measuring_time(start_time)
+ output_tensor = ttnn.to_torch(output_tensor)
+
+ pcc = check_with_pcc(torch_output_tensor, output_tensor, 0.999)
+ return [pcc, e2e_perf]
diff --git a/tests/sweep_framework/sweeps/eltwise/unary_backward/fill_bw/fill_bw_sharded.py b/tests/sweep_framework/sweeps/eltwise/unary_backward/fill_bw/fill_bw_sharded.py
new file mode 100644
index 00000000000..c10f4c90c45
--- /dev/null
+++ b/tests/sweep_framework/sweeps/eltwise/unary_backward/fill_bw/fill_bw_sharded.py
@@ -0,0 +1,128 @@
+# SPDX-FileCopyrightText: © 2024 Tenstorrent Inc.
+
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Optional, Tuple
+from functools import partial
+
+import json
+import torch
+import random
+import ttnn
+import math
+from tests.sweep_framework.sweep_utils.utils import gen_shapes, sanitize_shape_rm
+from tests.sweep_framework.sweep_utils.sharding_utils import (
+ gen_sharded_spec_unary,
+ parse_sharding_spec,
+ invalidate_vector_sharding,
+)
+from tests.tt_eager.python_api_testing.sweep_tests.generation_funcs import gen_func_with_cast_tt
+
+from tests.ttnn.utils_for_testing import check_with_pcc, start_measuring_time, stop_measuring_time
+from models.utility_functions import torch_random
+
+# Override the default timeout in seconds for hang detection.
+TIMEOUT = 120
+
+random.seed(0)
+
+
+# Parameters provided to the test vector generator are defined here.
+# They are defined as dict-type suites that contain the arguments to the run function as keys, and lists of possible inputs as values.
+# Each suite has a key name (in this case "nightly") which will associate the test vectors to this specific suite of inputs.
+# Developers can create their own generator functions and pass them to the parameters as inputs.
+parameters = {
+ "nightly": {
+ "input_spec": gen_sharded_spec_unary(16, layouts=["TILE_LAYOUT"]),
+ "input_a_dtype": [ttnn.bfloat16],
+ },
+}
+
+
+# Invalidate vector is called during the generation phase where each vector will be passed in.
+# If invalidated, the vector will still be stored but will be skipped.
+# Returns False, None if the vector is valid, and True, str with a reason for invalidation if it is invalid.
+def invalidate_vector(test_vector) -> Tuple[bool, Optional[str]]:
+ input_layout = test_vector["input_spec"]["input_layout"]
+ sharding_invalidated, output_str = invalidate_vector_sharding(test_vector["input_spec"])
+
+ if input_layout == "ROW_MAJOR_LAYOUT":
+ return True, "Input to eltwise binary must be tilized"
+ if input_layout == "ROW_MAJOR_LAYOUT" and test_vector["input_a_dtype"] == ttnn.bfloat8_b:
+ return True, "bfloat8_b is only supported on tiled layout"
+ if sharding_invalidated:
+ return sharding_invalidated, output_str
+
+ return False, None
+
+
+# These are the run instructions for the test, defined by the developer.
+# The run function must take the above-defined parameters as inputs.
+# The runner will call this run function with each test vector, and the returned results from this function will be stored.
+# If you defined a mesh_device_fixture above, the object you yielded will be passed into this function as 'device'. Otherwise, it will be the default ttnn device opened by the infra.
+def run(
+ input_spec,
+ input_a_dtype,
+ *,
+ device,
+) -> list:
+ data_seed = random.randint(0, 20000000)
+ torch.manual_seed(data_seed)
+
+ (
+ input_shape,
+ core_grid,
+ sharding_strategy,
+ shard_orientation,
+ tensor_hw_as_shard_shape,
+ input_layout,
+ shard_height_mul_of_32,
+ ) = parse_sharding_spec(input_spec)
+
+ if input_layout == ttnn.ROW_MAJOR_LAYOUT:
+ input_shape = sanitize_shape_rm(input_shape)
+
+ sharded_config = ttnn.create_sharded_memory_config_(
+ shape=input_shape,
+ core_grid=core_grid,
+ strategy=sharding_strategy,
+ orientation=shard_orientation,
+ use_height_and_width_as_shard_shape=tensor_hw_as_shard_shape,
+ tile_layout=shard_height_mul_of_32,
+ )
+
+ torch_grad_tensor = gen_func_with_cast_tt(
+ partial(torch_random, low=-100, high=100, dtype=torch.float32), input_a_dtype
+ )(input_shape)
+
+ torch_input_tensor = gen_func_with_cast_tt(
+ partial(torch_random, low=-100, high=100, dtype=torch.float32), input_a_dtype
+ )(input_shape)
+
+ torch_input_tensor.requires_grad = True
+ golden_function = ttnn.get_golden_function(ttnn.fill_bw)
+ torch_output_tensor = golden_function(torch_grad_tensor, torch_input_tensor)
+
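+    # The golden for fill_bw returns a list of gradients; the PCC check below indexes [0]
+    # to match the single tensor produced by the device op.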
+ grad_tensor = ttnn.from_torch(
+ torch_grad_tensor,
+ dtype=input_a_dtype,
+ layout=input_layout,
+ device=device,
+ memory_config=sharded_config,
+ )
+
+ input_tensor = ttnn.from_torch(
+ torch_input_tensor,
+ dtype=input_a_dtype,
+ layout=input_layout,
+ device=device,
+ memory_config=sharded_config,
+ )
+
+ start_time = start_measuring_time()
+ output_tensor = ttnn.fill_bw(grad_tensor, input_tensor, memory_config=sharded_config)[0]
+ e2e_perf = stop_measuring_time(start_time)
+ output_tensor = ttnn.to_torch(output_tensor)
+
+ pcc = check_with_pcc(torch_output_tensor[0], output_tensor, 0.999)
+ return [pcc, e2e_perf]
diff --git a/tests/sweep_framework/sweeps/eltwise/unary_backward/hardsigmoid_bw/hardsigmoid_bw_sharded.py b/tests/sweep_framework/sweeps/eltwise/unary_backward/hardsigmoid_bw/hardsigmoid_bw_sharded.py
new file mode 100644
index 00000000000..89fd978b802
--- /dev/null
+++ b/tests/sweep_framework/sweeps/eltwise/unary_backward/hardsigmoid_bw/hardsigmoid_bw_sharded.py
@@ -0,0 +1,128 @@
+# SPDX-FileCopyrightText: © 2024 Tenstorrent Inc.
+
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Optional, Tuple
+from functools import partial
+
+import json
+import torch
+import random
+import ttnn
+import math
+from tests.sweep_framework.sweep_utils.utils import gen_shapes, sanitize_shape_rm
+from tests.sweep_framework.sweep_utils.sharding_utils import (
+ gen_sharded_spec_unary,
+ parse_sharding_spec,
+ invalidate_vector_sharding,
+)
+from tests.tt_eager.python_api_testing.sweep_tests.generation_funcs import gen_func_with_cast_tt
+
+from tests.ttnn.utils_for_testing import check_with_pcc, start_measuring_time, stop_measuring_time
+from models.utility_functions import torch_random
+
+# Override the default timeout in seconds for hang detection.
+TIMEOUT = 120
+
+random.seed(0)
+
+
+# Parameters provided to the test vector generator are defined here.
+# They are defined as dict-type suites that contain the arguments to the run function as keys, and lists of possible inputs as values.
+# Each suite has a key name (in this case "nightly") which will associate the test vectors to this specific suite of inputs.
+# Developers can create their own generator functions and pass them to the parameters as inputs.
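+# Unlike the other sharded sweeps here, this one caps max_tensor_size_per_core (16 * 1024),
+# presumably to keep hardsigmoid_bw shard sizes small enough to finish within TIMEOUT.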
+parameters = {
+ "nightly": {
+ "input_spec": gen_sharded_spec_unary(16, max_tensor_size_per_core=16 * 1024, layouts=["TILE_LAYOUT"]),
+ "input_a_dtype": [ttnn.bfloat16],
+ },
+}
+
+
+# Invalidate vector is called during the generation phase where each vector will be passed in.
+# If invalidated, the vector will still be stored but will be skipped.
+# Returns False, None if the vector is valid, and True, str with a reason for invalidation if it is invalid.
+def invalidate_vector(test_vector) -> Tuple[bool, Optional[str]]:
+ input_layout = test_vector["input_spec"]["input_layout"]
+ sharding_invalidated, output_str = invalidate_vector_sharding(test_vector["input_spec"])
+
+ if input_layout == "ROW_MAJOR_LAYOUT":
+ return True, "Input to eltwise binary must be tilized"
+ if input_layout == "ROW_MAJOR_LAYOUT" and test_vector["input_a_dtype"] == ttnn.bfloat8_b:
+ return True, "bfloat8_b is only supported on tiled layout"
+ if sharding_invalidated:
+ return sharding_invalidated, output_str
+
+ return False, None
+
+
+# These are the run instructions for the test, defined by the developer.
+# The run function must take the above-defined parameters as inputs.
+# The runner will call this run function with each test vector, and the returned results from this function will be stored.
+# If you defined a mesh_device_fixture above, the object you yielded will be passed into this function as 'device'. Otherwise, it will be the default ttnn device opened by the infra.
+def run(
+ input_spec,
+ input_a_dtype,
+ *,
+ device,
+) -> list:
+ data_seed = random.randint(0, 20000000)
+ torch.manual_seed(data_seed)
+
+ (
+ input_shape,
+ core_grid,
+ sharding_strategy,
+ shard_orientation,
+ tensor_hw_as_shard_shape,
+ input_layout,
+ shard_height_mul_of_32,
+ ) = parse_sharding_spec(input_spec)
+
+ if input_layout == ttnn.ROW_MAJOR_LAYOUT:
+ input_shape = sanitize_shape_rm(input_shape)
+
+ sharded_config = ttnn.create_sharded_memory_config_(
+ shape=input_shape,
+ core_grid=core_grid,
+ strategy=sharding_strategy,
+ orientation=shard_orientation,
+ use_height_and_width_as_shard_shape=tensor_hw_as_shard_shape,
+ tile_layout=shard_height_mul_of_32,
+ )
+
+ torch_grad_tensor = gen_func_with_cast_tt(
+ partial(torch_random, low=-100, high=100, dtype=torch.float32), input_a_dtype
+ )(input_shape)
+
+ torch_input_tensor = gen_func_with_cast_tt(
+ partial(torch_random, low=-100, high=100, dtype=torch.float32), input_a_dtype
+ )(input_shape)
+
+ torch_input_tensor.requires_grad = True
+ golden_function = ttnn.get_golden_function(ttnn.hardsigmoid_bw)
+ torch_output_tensor = golden_function(torch_grad_tensor, torch_input_tensor)[0]
+
+ grad_tensor = ttnn.from_torch(
+ torch_grad_tensor,
+ dtype=input_a_dtype,
+ layout=input_layout,
+ device=device,
+ memory_config=sharded_config,
+ )
+
+ input_tensor = ttnn.from_torch(
+ torch_input_tensor,
+ dtype=input_a_dtype,
+ layout=input_layout,
+ device=device,
+ memory_config=sharded_config,
+ )
+
+ start_time = start_measuring_time()
+ output_tensor = ttnn.hardsigmoid_bw(grad_tensor, input_tensor, memory_config=sharded_config)[0]
+ e2e_perf = stop_measuring_time(start_time)
+ output_tensor = ttnn.to_torch(output_tensor)
+
+ pcc = check_with_pcc(torch_output_tensor, output_tensor, 0.999)
+ return [pcc, e2e_perf]
diff --git a/tests/tt_eager/integration_tests/test_bert.cpp b/tests/tt_eager/integration_tests/test_bert.cpp
index 60728c57d08..f6d72c7c85d 100644
--- a/tests/tt_eager/integration_tests/test_bert.cpp
+++ b/tests/tt_eager/integration_tests/test_bert.cpp
@@ -8,8 +8,8 @@
#include "ttnn/tensor/tensor.hpp"
#include "ttnn/operation.hpp"
#include "ttnn/operations/normalization/softmax/softmax.hpp"
-#include "tt_metal/common/constants.hpp"
-#include "tt_metal/host_api.hpp"
+#include
+#include
#include "ttnn/operations/functions.hpp"
#include "ttnn/operations/matmul/matmul.hpp"
#include "ttnn/operations/normalization/layernorm/layernorm.hpp"
diff --git a/tests/tt_eager/ops/test_average_pool.cpp b/tests/tt_eager/ops/test_average_pool.cpp
index dbf1a1beca8..0608e998dfb 100644
--- a/tests/tt_eager/ops/test_average_pool.cpp
+++ b/tests/tt_eager/ops/test_average_pool.cpp
@@ -7,7 +7,7 @@
#include "ttnn/operations/functions.hpp"
#include "ttnn/tensor/tensor.hpp"
-#include "common/constants.hpp"
+#include
using tt::tt_metal::DataType;
using tt::tt_metal::IDevice;
diff --git a/tests/tt_eager/ops/test_bcast_op.cpp b/tests/tt_eager/ops/test_bcast_op.cpp
index 3a331d8e226..dcef94c93a7 100644
--- a/tests/tt_eager/ops/test_bcast_op.cpp
+++ b/tests/tt_eager/ops/test_bcast_op.cpp
@@ -2,11 +2,11 @@
//
// SPDX-License-Identifier: Apache-2.0
-#include "tt_metal/host_api.hpp"
+#include
#include "ttnn/cpp/ttnn/operations/creation.hpp"
#include "ttnn/tensor/tensor.hpp"
#include "ttnn/operations/data_movement/bcast/bcast.hpp"
-#include "common/constants.hpp"
+#include
#include
#include
diff --git a/tests/tt_eager/ops/test_bmm_op.cpp b/tests/tt_eager/ops/test_bmm_op.cpp
index 286ca727baa..8e73545f435 100644
--- a/tests/tt_eager/ops/test_bmm_op.cpp
+++ b/tests/tt_eager/ops/test_bmm_op.cpp
@@ -2,12 +2,12 @@
//
// SPDX-License-Identifier: Apache-2.0
-#include "tt_metal/host_api.hpp"
+#include
#include "ttnn/cpp/ttnn/operations/creation.hpp"
#include "ttnn/tensor/tensor.hpp"
#include "ttnn/tensor/types.hpp"
#include "ttnn/operations/matmul/device/matmul_op.hpp"
-#include "common/constants.hpp"
+#include
#include "ttnn/operations/functions.hpp"
using namespace tt;
diff --git a/tests/tt_eager/ops/test_conv_prepare_weights_and_biases.cpp b/tests/tt_eager/ops/test_conv_prepare_weights_and_biases.cpp
index a79ec56ef08..a2ea8ff96d2 100644
--- a/tests/tt_eager/ops/test_conv_prepare_weights_and_biases.cpp
+++ b/tests/tt_eager/ops/test_conv_prepare_weights_and_biases.cpp
@@ -2,8 +2,8 @@
//
// SPDX-License-Identifier: Apache-2.0
-#include "common/assert.hpp"
-#include "common/bfloat16.hpp"
+#include
+#include
#include "ttnn/cpp/ttnn/tensor/host_buffer/functions.hpp"
#include "ttnn/cpp/ttnn/tensor/types.hpp"
#include "ttnn/tensor/host_buffer/functions.hpp"
diff --git a/tests/tt_eager/ops/test_eltwise_binary_op.cpp b/tests/tt_eager/ops/test_eltwise_binary_op.cpp
index 5b62dc878e9..e5251876b0a 100644
--- a/tests/tt_eager/ops/test_eltwise_binary_op.cpp
+++ b/tests/tt_eager/ops/test_eltwise_binary_op.cpp
@@ -2,7 +2,7 @@
//
// SPDX-License-Identifier: Apache-2.0
-#include "common/constants.hpp"
+#include
#include "ttnn/tensor/host_buffer/functions.hpp"
#include "ttnn/tensor/host_buffer/types.hpp"
#include "ttnn/tensor/tensor.hpp"
diff --git a/tests/tt_eager/ops/test_eltwise_unary_op.cpp b/tests/tt_eager/ops/test_eltwise_unary_op.cpp
index e8ea5e37385..462979b0b38 100644
--- a/tests/tt_eager/ops/test_eltwise_unary_op.cpp
+++ b/tests/tt_eager/ops/test_eltwise_unary_op.cpp
@@ -4,7 +4,7 @@
#include
-#include "common/constants.hpp"
+#include
#include "ttnn/tensor/host_buffer/functions.hpp"
#include "ttnn/tensor/host_buffer/types.hpp"
#include "ttnn/tensor/tensor.hpp"
@@ -12,7 +12,7 @@
#include "ttnn/operations/eltwise/unary/device/unary_device_operation.hpp"
#include "ttnn/operations/data_movement/pad/pad.hpp"
#include "ttnn/operation.hpp"
-#include "tt_metal/host_api.hpp"
+#include
#include "ttnn/operations/functions.hpp"
using tt::tt_metal::DataType;
diff --git a/tests/tt_eager/ops/test_fold_op.cpp b/tests/tt_eager/ops/test_fold_op.cpp
index 386b8f41f19..a2a5e8e2d7c 100644
--- a/tests/tt_eager/ops/test_fold_op.cpp
+++ b/tests/tt_eager/ops/test_fold_op.cpp
@@ -9,7 +9,7 @@
#include "ttnn/tensor/tensor.hpp"
#include "ttnn/operations/data_movement/fold/fold.hpp"
-#include "tt_metal/host_api.hpp"
+#include
using namespace tt;
using namespace tt::tt_metal;
diff --git a/tests/tt_eager/ops/test_layernorm_op.cpp b/tests/tt_eager/ops/test_layernorm_op.cpp
index 320b127eb3a..e6605614d0b 100644
--- a/tests/tt_eager/ops/test_layernorm_op.cpp
+++ b/tests/tt_eager/ops/test_layernorm_op.cpp
@@ -2,7 +2,7 @@
//
// SPDX-License-Identifier: Apache-2.0
-#include "tt_metal/host_api.hpp"
+#include
#include "ttnn/tensor/tensor.hpp"
#include "ttnn/operations/normalization/layernorm/layernorm.hpp"
#include
diff --git a/tests/tt_eager/ops/test_pad_op.cpp b/tests/tt_eager/ops/test_pad_op.cpp
index 8a551740cf0..7d8619d0a42 100644
--- a/tests/tt_eager/ops/test_pad_op.cpp
+++ b/tests/tt_eager/ops/test_pad_op.cpp
@@ -4,12 +4,12 @@
#include
-#include "common/constants.hpp"
+#include
#include "ttnn/tensor/host_buffer/types.hpp"
#include "ttnn/tensor/tensor.hpp"
#include "ttnn/operation.hpp"
#include "ttnn/operations/data_movement/pad/pad.hpp"
-#include "tt_metal/host_api.hpp"
+#include
#include "ttnn/operations/functions.hpp"
using tt::tt_metal::DataType;
diff --git a/tests/tt_eager/ops/test_sfpu.cpp b/tests/tt_eager/ops/test_sfpu.cpp
index cb857da85b7..b4e94afa4dc 100644
--- a/tests/tt_eager/ops/test_sfpu.cpp
+++ b/tests/tt_eager/ops/test_sfpu.cpp
@@ -10,10 +10,10 @@
#include
-#include "tt_metal/host_api.hpp"
-#include "tt_metal/detail/tt_metal.hpp"
-#include "tt_metal/impl/buffers/buffer.hpp"
-#include "common/bfloat16.hpp"
+#include
+#include
+#include
+#include
#include "tests_common/sfpu_helper/sfpu_helper.hpp"
#include "ttnn/operations/eltwise/unary/common/unary_op_utils.hpp"
// #include "tt_gdb/tt_gdb.hpp"
diff --git a/tests/tt_eager/ops/test_sliding_window_ops.cpp b/tests/tt_eager/ops/test_sliding_window_ops.cpp
index a0e62d2038d..0abdf062283 100644
--- a/tests/tt_eager/ops/test_sliding_window_ops.cpp
+++ b/tests/tt_eager/ops/test_sliding_window_ops.cpp
@@ -10,7 +10,7 @@
#include "ttnn/tensor/tensor.hpp"
#include "ttnn/operations/sliding_window/reference_sliding_window.hpp"
#include "ttnn/tensor/tensor.hpp"
-#include "tt_metal/host_api.hpp"
+#include
#include "ttnn/operations/functions.hpp"
#include "ttnn/tensor/types.hpp"
diff --git a/tests/tt_eager/ops/test_softmax_op.cpp b/tests/tt_eager/ops/test_softmax_op.cpp
index b843d54a856..c6b583ad5b5 100644
--- a/tests/tt_eager/ops/test_softmax_op.cpp
+++ b/tests/tt_eager/ops/test_softmax_op.cpp
@@ -2,7 +2,7 @@
//
// SPDX-License-Identifier: Apache-2.0
-#include "tt_metal/host_api.hpp"
+#include
#include "ttnn/tensor/tensor.hpp"
#include "ttnn/operations/normalization/softmax/softmax.hpp"
#include "ttnn/operations/functions.hpp"
diff --git a/tests/tt_eager/ops/test_tilize_op.cpp b/tests/tt_eager/ops/test_tilize_op.cpp
index 731cc295fff..ca3d831c669 100644
--- a/tests/tt_eager/ops/test_tilize_op.cpp
+++ b/tests/tt_eager/ops/test_tilize_op.cpp
@@ -7,12 +7,12 @@
#include
#include
-#include "common/constants.hpp"
+#include
#include "ttnn/tensor/host_buffer/functions.hpp"
#include "ttnn/tensor/host_buffer/types.hpp"
#include "ttnn/tensor/tensor.hpp"
#include "ttnn/operations/data_movement/tilize/tilize.hpp"
-#include "tt_metal/host_api.hpp"
+#include
using namespace tt;
using namespace tt_metal;
diff --git a/tests/tt_eager/ops/test_tilize_op_channels_last.cpp b/tests/tt_eager/ops/test_tilize_op_channels_last.cpp
index 9d292fda6e8..be19f2cac10 100644
--- a/tests/tt_eager/ops/test_tilize_op_channels_last.cpp
+++ b/tests/tt_eager/ops/test_tilize_op_channels_last.cpp
@@ -6,12 +6,12 @@
#include
#include
-#include "common/constants.hpp"
+#include
#include "ttnn/tensor/host_buffer/functions.hpp"
#include "ttnn/tensor/host_buffer/types.hpp"
#include "ttnn/tensor/tensor.hpp"
#include "ttnn/operations/data_movement/tilize/tilize.hpp"
-#include "tt_metal/host_api.hpp"
+#include
#include "ttnn/operations/functions.hpp"
using namespace tt;
diff --git a/tests/tt_eager/ops/test_tilize_zero_padding.cpp b/tests/tt_eager/ops/test_tilize_zero_padding.cpp
index 3ae2ecbd80f..6037c70de3e 100644
--- a/tests/tt_eager/ops/test_tilize_zero_padding.cpp
+++ b/tests/tt_eager/ops/test_tilize_zero_padding.cpp
@@ -6,12 +6,12 @@
#include
#include
-#include "common/constants.hpp"
+#include
#include "ttnn/tensor/host_buffer/functions.hpp"
#include "ttnn/tensor/host_buffer/types.hpp"
#include "ttnn/tensor/tensor.hpp"
#include "ttnn/operations/data_movement/tilize_with_val_padding/tilize_with_val_padding.hpp"
-#include "tt_metal/host_api.hpp"
+#include
#include "ttnn/operations/functions.hpp"
using namespace tt;
diff --git a/tests/tt_eager/ops/test_tilize_zero_padding_channels_last.cpp b/tests/tt_eager/ops/test_tilize_zero_padding_channels_last.cpp
index 2181552b33b..26199d4e833 100644
--- a/tests/tt_eager/ops/test_tilize_zero_padding_channels_last.cpp
+++ b/tests/tt_eager/ops/test_tilize_zero_padding_channels_last.cpp
@@ -6,13 +6,13 @@
#include
#include
-#include "common/constants.hpp"
+#include
#include "ttnn/cpp/ttnn/operations/creation.hpp"
#include "ttnn/tensor/host_buffer/functions.hpp"
#include "ttnn/tensor/host_buffer/types.hpp"
#include "ttnn/tensor/tensor.hpp"
#include "ttnn/operations/data_movement/tilize_with_val_padding/tilize_with_val_padding.hpp"
-#include "tt_metal/host_api.hpp"
+#include
using namespace tt;
using namespace tt_metal;
diff --git a/tests/tt_eager/ops/test_transpose_op.cpp b/tests/tt_eager/ops/test_transpose_op.cpp
index 42f5a12d8ea..e157d30b86d 100644
--- a/tests/tt_eager/ops/test_transpose_op.cpp
+++ b/tests/tt_eager/ops/test_transpose_op.cpp
@@ -2,7 +2,7 @@
//
// SPDX-License-Identifier: Apache-2.0
-#include "tt_metal/host_api.hpp"
+#include
#include "ttnn/tensor/tensor.hpp"
#include "ttnn/operations/data_movement/transpose/transpose.hpp"
#include
diff --git a/tests/tt_eager/ops/test_transpose_wh_multi_core.cpp b/tests/tt_eager/ops/test_transpose_wh_multi_core.cpp
index 9749652d152..7b332af4e87 100644
--- a/tests/tt_eager/ops/test_transpose_wh_multi_core.cpp
+++ b/tests/tt_eager/ops/test_transpose_wh_multi_core.cpp
@@ -11,7 +11,7 @@
#include "ttnn/tensor/host_buffer/types.hpp"
#include "ttnn/tensor/tensor.hpp"
#include "ttnn/operations/data_movement/transpose/transpose.hpp"
-#include "tt_metal/host_api.hpp"
+#include
using namespace tt;
using namespace tt_metal;
diff --git a/tests/tt_eager/ops/test_transpose_wh_single_core.cpp b/tests/tt_eager/ops/test_transpose_wh_single_core.cpp
index 9749652d152..7b332af4e87 100644
--- a/tests/tt_eager/ops/test_transpose_wh_single_core.cpp
+++ b/tests/tt_eager/ops/test_transpose_wh_single_core.cpp
@@ -11,7 +11,7 @@
#include "ttnn/tensor/host_buffer/types.hpp"
#include "ttnn/tensor/tensor.hpp"
#include "ttnn/operations/data_movement/transpose/transpose.hpp"
-#include "tt_metal/host_api.hpp"
+#include
using namespace tt;
using namespace tt_metal;
diff --git a/tests/tt_eager/python_api_testing/unit_testing/misc/test_transpose.py b/tests/tt_eager/python_api_testing/unit_testing/misc/test_transpose.py
index f720ed60cbc..557382809f7 100644
--- a/tests/tt_eager/python_api_testing/unit_testing/misc/test_transpose.py
+++ b/tests/tt_eager/python_api_testing/unit_testing/misc/test_transpose.py
@@ -1,4 +1,4 @@
-# SPDX-FileCopyrightText: © 2024 Tenstorrent Inc.
+# SPDX-FileCopyrightText: © 2025 Tenstorrent Inc.
# SPDX-License-Identifier: Apache-2.0
@@ -1092,3 +1092,31 @@ def test_transpose_hw_rm(shape, device):
tt_output = ttnn.transpose(tt_input, 2, 3)
tt_output = ttnn.to_torch(tt_output)
assert_with_pcc(torch_output, tt_output, 0.9999)
+
+
+@skip_for_grayskull("Grayskull does not support float32")
+def test_transpose_16411(device):
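+    # Exercises transpose on a rank-6 tensor across several dim pairs; the number in the
+    # test name presumably refers to the tracking issue for this case.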
+ torch.manual_seed(2005)
+ input_shape = (5, 3, 1, 1, 12, 8)
+ a = torch.rand(input_shape, dtype=torch.bfloat16)
+ p_b2 = torch.transpose(a, 1, 3)
+ p_b3 = torch.transpose(a, 1, 5)
+ p_c = torch.transpose(a, 0, 4)
+ p_c2 = torch.transpose(a, 1, 4)
+ p_c3 = torch.transpose(a, 2, 4)
+ p_c4 = torch.transpose(a, 3, 4)
+
+ b = ttnn.from_torch(a, dtype=ttnn.float32, layout=ttnn.TILE_LAYOUT, device=device)
+ b2 = ttnn.transpose(b, 1, 3)
+ b3 = ttnn.transpose(b, 1, 5)
+ c = ttnn.transpose(b, 0, 4)
+ c2 = ttnn.transpose(b, 1, 4)
+ c3 = ttnn.transpose(b, 2, 4)
+ c4 = ttnn.transpose(b, 3, 4)
+
+ assert_with_pcc(p_b2, ttnn.to_torch(b2), 0.9999)
+ assert_with_pcc(p_b3, ttnn.to_torch(b3), 0.9999)
+ assert_with_pcc(p_c, ttnn.to_torch(c), 0.9999)
+ assert_with_pcc(p_c2, ttnn.to_torch(c2), 0.9999)
+ assert_with_pcc(p_c3, ttnn.to_torch(c3), 0.9999)
+ assert_with_pcc(p_c4, ttnn.to_torch(c4), 0.9999)
diff --git a/tests/tt_eager/tensors/test_async_tensor_apis.cpp b/tests/tt_eager/tensors/test_async_tensor_apis.cpp
index 191e2308373..a7e3538efce 100644
--- a/tests/tt_eager/tensors/test_async_tensor_apis.cpp
+++ b/tests/tt_eager/tensors/test_async_tensor_apis.cpp
@@ -4,8 +4,8 @@
#include
-#include "common/bfloat16.hpp"
-#include "common/constants.hpp"
+#include <tt-metalium/bfloat16.hpp>
+#include <tt-metalium/constants.hpp>
#include "ttnn/cpp/ttnn/operations/creation.hpp"
#include "ttnn/tensor/host_buffer/functions.hpp"
#include "ttnn/tensor/host_buffer/types.hpp"
@@ -13,7 +13,7 @@
#include "ttnn/tensor/tensor_impl.hpp"
#include "ttnn/tensor/types.hpp"
#include "tests/tt_metal/tt_metal/common/dispatch_fixture.hpp"
-#include "tt_metal/host_api.hpp"
+#include <tt-metalium/host_api.hpp>
#include "ttnn/operations/eltwise/binary/binary.hpp"
#include "ttnn/operations/eltwise/unary/unary.hpp"
diff --git a/tests/tt_eager/tensors/test_copy_and_move.cpp b/tests/tt_eager/tensors/test_copy_and_move.cpp
index 1b6b4e15b4e..96ba15e48a7 100644
--- a/tests/tt_eager/tensors/test_copy_and_move.cpp
+++ b/tests/tt_eager/tensors/test_copy_and_move.cpp
@@ -2,14 +2,14 @@
//
// SPDX-License-Identifier: Apache-2.0
-#include "common/bfloat16.hpp"
-#include "common/constants.hpp"
+#include <tt-metalium/bfloat16.hpp>
+#include <tt-metalium/constants.hpp>
#include "ttnn/cpp/ttnn/operations/creation.hpp"
#include "ttnn/tensor/host_buffer/functions.hpp"
#include "ttnn/tensor/host_buffer/types.hpp"
#include "ttnn/tensor/tensor.hpp"
#include "ttnn/tensor/tensor_impl.hpp"
-#include "tt_metal/host_api.hpp"
+#include <tt-metalium/host_api.hpp>
#include "ttnn/operations/functions.hpp"
using namespace tt;
diff --git a/tests/tt_eager/tensors/test_host_device_loopback.cpp b/tests/tt_eager/tensors/test_host_device_loopback.cpp
index c50b8ee06af..40d8886fcb9 100644
--- a/tests/tt_eager/tensors/test_host_device_loopback.cpp
+++ b/tests/tt_eager/tensors/test_host_device_loopback.cpp
@@ -6,11 +6,11 @@
#include
#include
-#include "common/constants.hpp"
+#include <tt-metalium/constants.hpp>
#include "ttnn/tensor/host_buffer/functions.hpp"
#include "ttnn/tensor/host_buffer/types.hpp"
#include "ttnn/tensor/tensor.hpp"
-#include "tt_metal/host_api.hpp"
+#include <tt-metalium/host_api.hpp>
#include "ttnn/operations/functions.hpp"
using namespace tt;
diff --git a/tests/tt_eager/tensors/test_ranks.cpp b/tests/tt_eager/tensors/test_ranks.cpp
index 593ea923810..10dd0acbc77 100644
--- a/tests/tt_eager/tensors/test_ranks.cpp
+++ b/tests/tt_eager/tensors/test_ranks.cpp
@@ -6,13 +6,13 @@
#include
#include
-#include "common/bfloat16.hpp"
-#include "common/constants.hpp"
+#include <tt-metalium/bfloat16.hpp>
+#include <tt-metalium/constants.hpp>
#include "ttnn/tensor/host_buffer/functions.hpp"
#include "ttnn/tensor/host_buffer/types.hpp"
#include "ttnn/tensor/tensor.hpp"
#include "ttnn/tensor/tensor_impl.hpp"
-#include "tt_metal/host_api.hpp"
+#include <tt-metalium/host_api.hpp>
#include "ttnn/operations/functions.hpp"
using namespace tt;
diff --git a/tests/tt_eager/tensors/test_raw_host_memory_pointer.cpp b/tests/tt_eager/tensors/test_raw_host_memory_pointer.cpp
index 720a9fe0c93..bc68c910fe5 100644
--- a/tests/tt_eager/tensors/test_raw_host_memory_pointer.cpp
+++ b/tests/tt_eager/tensors/test_raw_host_memory_pointer.cpp
@@ -6,15 +6,15 @@
#include
#include
-#include "common/bfloat16.hpp"
-#include "common/constants.hpp"
+#include <tt-metalium/bfloat16.hpp>
+#include <tt-metalium/constants.hpp>
#include "ttnn/tensor/host_buffer/functions.hpp"
#include "ttnn/tensor/host_buffer/types.hpp"
#include "ttnn/tensor/tensor.hpp"
#include "ttnn/tensor/tensor_impl.hpp"
#include "ttnn/operations/eltwise/binary/binary.hpp"
#include "ttnn/operations/eltwise/unary/unary.hpp"
-#include "tt_metal/host_api.hpp"
+#include <tt-metalium/host_api.hpp>
#include "ttnn/operations/functions.hpp"
/*
diff --git a/tests/tt_metal/distributed/test_distributed.cpp b/tests/tt_metal/distributed/test_distributed.cpp
index 1556024c57c..5e87bb84f8d 100644
--- a/tests/tt_metal/distributed/test_distributed.cpp
+++ b/tests/tt_metal/distributed/test_distributed.cpp
@@ -3,7 +3,7 @@
// SPDX-License-Identifier: Apache-2.0
#include "tests/tt_metal/distributed/distributed_fixture.hpp"
-#include "tt_metal/distributed/system_mesh.hpp"
+#include <tt-metalium/system_mesh.hpp>
namespace tt::tt_metal::distributed::test {
diff --git a/tests/tt_metal/distributed/test_mesh_workload.cpp b/tests/tt_metal/distributed/test_mesh_workload.cpp
index c746d6a80a3..43f9a17b8bf 100644
--- a/tests/tt_metal/distributed/test_mesh_workload.cpp
+++ b/tests/tt_metal/distributed/test_mesh_workload.cpp
@@ -6,9 +6,9 @@
#include "tests/tt_metal/tt_metal/dispatch/dispatch_test_utils.hpp"
#include "tests/tt_metal/distributed/distributed_fixture.hpp"
-#include "tt_metal/host_api.hpp"
-#include "tt_metal/detail/tt_metal.hpp"
-#include "tt_metal/common/bfloat16.hpp"
+#include <tt-metalium/host_api.hpp>
+#include <tt-metalium/tt_metal.hpp>
+#include <tt-metalium/bfloat16.hpp>
namespace tt::tt_metal::distributed::test {
diff --git a/tests/tt_metal/llrt/test_libs/conv_pattern.hpp b/tests/tt_metal/llrt/test_libs/conv_pattern.hpp
index aeacc91ee23..e1144d75f9f 100644
--- a/tests/tt_metal/llrt/test_libs/conv_pattern.hpp
+++ b/tests/tt_metal/llrt/test_libs/conv_pattern.hpp
@@ -5,8 +5,8 @@
#pragma once
#include