diff --git a/.github/actions/build-toolchain/action.yml b/.github/actions/build-toolchain/action.yml
index ee75b1e9d3..6fe6cc3444 100644
--- a/.github/actions/build-toolchain/action.yml
+++ b/.github/actions/build-toolchain/action.yml
@@ -4,6 +4,10 @@ inputs:
   os:
     description: 'Operating System'
     required: true
+  sdk:
+    description: 'macOS SDK, if applicable'
+    required: true
+    default: '0'
 runs:
   using: 'composite'
   steps:
@@ -18,7 +22,7 @@ runs:
       uses: actions/cache@v4
       with:
         path: /opt/ttmlir-toolchain
-        key: ${{ inputs.os }}-ttmlir-toolchain-${{ hashFiles('env/**') }}
+        key: ${{ inputs.os }}-ttmlir-toolchain-${{ hashFiles('env/**') }}-${{ inputs.sdk }}

     - name: 'Build ttmlir-toolchain'
       if: steps.cache-toolchain.outputs.cache-hit != 'true'
diff --git a/.github/workflows/macos-build.yml b/.github/workflows/macos-build.yml
index 43dd21370c..82c464244d 100644
--- a/.github/workflows/macos-build.yml
+++ b/.github/workflows/macos-build.yml
@@ -4,6 +4,9 @@ on:
   workflow_dispatch:
   workflow_call:

+env:
+  SDK_VERSION: "0"
+
 jobs:
   build:
     strategy:
@@ -23,16 +26,23 @@ jobs:
         with:
           os: ${{ matrix.build.runs-on }}

+      - name: Get macOS SDK version
+        if: startsWith(matrix.build.runs-on, 'macos')
+        shell: bash
+        run: |
+          echo "SDK_VERSION=$(xcrun --show-sdk-version)" >> $GITHUB_ENV
+
       - name: Build and cache ttmlir-toolchain
         uses: ./.github/actions/build-toolchain
         with:
           os: ${{ matrix.build.runs-on }}
+          sdk: ${{ env.SDK_VERSION }}

       - name: ccache
         uses: hendrikmuhs/ccache-action@v1.2
         with:
           create-symlink: true
-          key: ${{ matrix.build.runs-on }}-runtime-${{ matrix.build.enable_runtime }}
+          key: ${{ matrix.build.runs-on }}-runtime-${{ matrix.build.enable_runtime }}-${{ env.SDK_VERSION }}

       - name: Set reusable strings
         id: strings
@@ -72,6 +82,12 @@ jobs:
           source env/activate
           cmake --build ${{ steps.strings.outputs.build-output-dir }} --config ${{ matrix.build.build_type }} -- check-ttmlir

+      - name: Build ttrt
+        shell: bash
+        run: |
+          source env/activate
+          cmake --build ${{ steps.strings.outputs.build-output-dir }} --config ${{ matrix.build.build_type }} -- ttrt
+
       - name: Upload Test Report
         uses: actions/upload-artifact@v4
         with:
diff --git a/CMakeLists.txt b/CMakeLists.txt
index e1226581d9..374a31d442 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -5,6 +5,8 @@ if (NOT DEFINED ENV{TTMLIR_ENV_ACTIVATED})
   message(FATAL_ERROR "tt-mlir environment not activated. Please run 'source env/activate'.")
 endif()

+option(TT_RUNTIME_ENABLE_PERF_TRACE "Enable performance mode" OFF)
+
 set(CMAKE_BUILD_WITH_INSTALL_NAME_DIR ON)

 set(CMAKE_CXX_STANDARD 17 CACHE STRING "C++ standard to conform to")
diff --git a/docs/CMakeLists.txt b/docs/CMakeLists.txt
index 7452829c41..1520f5faf2 100644
--- a/docs/CMakeLists.txt
+++ b/docs/CMakeLists.txt
@@ -1,6 +1,6 @@
 add_custom_target(copy-docs-dir
   COMMAND
-    cp -r ${CMAKE_CURRENT_SOURCE_DIR}/ ${CMAKE_CURRENT_BINARY_DIR}
+    cp -r ${CMAKE_CURRENT_SOURCE_DIR}/* ${CMAKE_CURRENT_BINARY_DIR}
 )

 add_custom_target(autogen-summary
diff --git a/docs/src/SUMMARY.md b/docs/src/SUMMARY.md
index 09036280a6..29eec9d1f7 100644
--- a/docs/src/SUMMARY.md
+++ b/docs/src/SUMMARY.md
@@ -25,3 +25,4 @@
   - [Runtime Stitching](./specs/runtime-stitching.md)
   - [Tensor Layout](./specs/tensor-layout.md)
   - [TTNN Interactive Visualizer](./specs/tensor-layout-interactive.md)
+  - [Device](./specs/device.md)
diff --git a/docs/src/adding-an-op.md b/docs/src/adding-an-op.md
index 883b4e5069..0eea465126 100644
--- a/docs/src/adding-an-op.md
+++ b/docs/src/adding-an-op.md
@@ -7,13 +7,19 @@ reference the diff alongside this guide to see the changes in full.

 This guide will cover the following steps:

-1. [Define the Op in the TTIR frontend dialect](#1-define-the-op-in-the-ttir-frontend-dialect)
-2. [Define the Op in the TTNN backend dialect](#2-define-the-op-in-the-ttnn-backend-dialect)
-3. [Convert / Implement the Op in the TTNN passes](#3-convert--implement-the-op-in-the-ttnn-passes)
-4. [Add a unit test for the Op](#4-add-a-unit-test-for-the-op)
-5. [Define flatbuffer schema for the Op](#5-define-flatbuffer-schema-for-the-op)
-6. [Serialize the Op in the flatbuffer format](#6-serialize-the-op-in-the-flatbuffer-format)
-7. [Add runtime support for the Op](#7-add-runtime-support-for-the-op)
+- [Adding an Op](#adding-an-op)
+  - [1. Define the Op in the TTIR frontend dialect](#1-define-the-op-in-the-ttir-frontend-dialect)
+  - [2. Define the Op in the TTNN backend dialect](#2-define-the-op-in-the-ttnn-backend-dialect)
+    - [`TTNNOps.td`](#ttnnopstd)
+    - [`TTNNOps.cpp`](#ttnnopscpp)
+  - [3. Convert / Implement the Op in the TTNN passes](#3-convert--implement-the-op-in-the-ttnn-passes)
+  - [4. Add a unit test for the Op](#4-add-a-unit-test-for-the-op)
+    - [`test/ttmlir/Dialect/TTNN/simple_matmul.mlir`](#testttmlirdialectttnnsimple_matmulmlir)
+  - [5. Define flatbuffer schema for the Op](#5-define-flatbuffer-schema-for-the-op)
+    - [`include/ttmlir/Target/TTNN/program.fbs`](#includettmlirtargetttnnprogramfbs)
+  - [6. Serialize the Op in the flatbuffer format](#6-serialize-the-op-in-the-flatbuffer-format)
+  - [7. Add runtime support for the Op](#7-add-runtime-support-for-the-op)
+    - [`runtime/lib/ttnn/program.cpp`](#runtimelibttnnprogramcpp)

 ## 1. Define the Op in the TTIR frontend dialect
@@ -99,13 +105,13 @@ section for details, the process is the same.
 Next we will implement the conversion from the TTIR `matmul` Op to the TTNN
 `matmul` Op. This is a trivial conversion, as the Ops are identical in their
 semantics, so the changeset isn't going to be very instructive, but will at
 least point to the
-files involved. The conversion is implemented in the `ConvertTTIRToTNN` pass in
-file `lib/Dialect/TTNN/Transforms/Passes.cpp`.
+files involved. The conversion is implemented in the `ConvertTTIRToTTNNPass` pass in
+file `lib/Conversion/TTIRToTTNN/TTIRToTTNNPass.cpp`.

-Zooming into `class ConvertTTIRToTNN` we can see we implement the pass interface
+Zooming into `class ConvertTTIRToTTNNPass` we can see that we implement the pass interface
 via member function `void runOnOperation() final`. This function will be called
 for every operation matching the type specified in the pass tablegen file. A
-quick look at `include/ttmlir/Dialect/TTNN/Passes.td` we can see:
+quick look at `include/ttmlir/Conversion/Passes.td` shows:

 ```
 def ConvertTTIRToTTNN: Pass<"convert-ttir-to-ttnn", "::mlir::ModuleOp"> {
@@ -121,22 +127,21 @@ can match much more complicated patterns (nested inside of the `ModuleOp`'s
 than just a single operation.

 ```cpp
-{{#include ../../../lib/Dialect/TTNN/Transforms/Passes.cpp:adding_an_op_matmul_rewrite_pattern_set}}
+{{#include ../../../lib/Conversion/TTIRToTTNN/TTIRToTTNN.cpp:adding_an_op_matmul_rewrite_pattern_set}}
 ```

-> More information on rewrite patterns and their capabilities can be found in the [MLIR
-> documentation](https://mlir.llvm.org/docs/PatternRewriter/).
+> More information on rewrite patterns and their capabilities can be found in the MLIR documentation [here](https://mlir.llvm.org/docs/PatternRewriter/) and [here](https://mlir.llvm.org/docs/DialectConversion/).

-For matmul, we defined a new pattern rewriter that's generic to all binary ops
+For matmul, we defined a new conversion pattern that's generic to all binary ops
 with arguments named `a` and `b`:

 ```cpp
-{{#include ../../../lib/Dialect/TTNN/Transforms/Passes.cpp:adding_an_op_matmul_op_rewriter}}
+{{#include ../../../lib/Conversion/TTIRToTTNN/TTIRToTTNN.cpp:adding_an_op_matmul_op_rewriter}}
 ```

 Invoked as part of the rewrite set:

 ```cpp
-TTIRToTTNNBinaryOpRewriter
+MatmulOpConversionPattern
 ```

 We also need to add this op to the C++ emitter,
@@ -149,7 +154,7 @@ So far we have defined the Op in the TTIR and TTNN dialects, implemented
 verifiers, and have conversion passes. Now we need to add a unit test to ensure
 that the pass is working correctly. The unit tests are located in
 `test/ttmlir/Dialect` area. In this case we'll add a test under the `TTNN`
-subdirectory since we are testing the `ConvertTTIRToTTNN` pass.
+subdirectory since we are testing the `ConvertTTIRToTTNNPass`.

 #### `test/ttmlir/Dialect/TTNN/simple_matmul.mlir`

@@ -215,11 +220,11 @@ to a program called `flatc` which generates C++ code (or any language for
 that matter) for serializing and deserializing the schema. This generated code
 can be found in `build/include/ttmlir/Target/TTNN/program_generated.h`.

-Let's head over to `lib/Dialect/TTNN/Transforms/TTNNToSerializedBinary.cpp` to define
+Let's head over to `lib/Target/TTNN/TTNNToFlatbuffer.cpp` to define
 a `createOp` overloaded function that does the conversion from MLIR to flatbuffer:

 ```cpp
-{{#include ../../../lib/Dialect/TTNN/Transforms/TTNNToSerializedBinary.cpp:adding_an_op_matmul_serialize_to_binary}}
+{{#include ../../../lib/Target/TTNN/TTNNToFlatbuffer.cpp:adding_an_op_matmul_serialize_to_binary}}
 ```

 Lots of things are happening here, let's break it down:
@@ -241,7 +246,7 @@ Lots of things are happening here, let's break it down:

 We can finally generate a binary with our new Op!
 We can use the following command:

 ```bash
-./build/bin/ttmlir-opt --ttir-layout --ttnn-open-device --convert-ttir-to-ttnn --ttnn-serialize-to-binary="output=out.ttnn" test/ttmlir/Dialect/TTNN/simple_matmul.mlir
+./build/bin/ttmlir-opt --ttir-to-ttnn-backend-pipeline test/ttmlir/Dialect/TTNN/simple_matmul.mlir | ./build/bin/ttmlir-translate --ttnn-to-flatbuffer -o out.ttnn
 ```

 And we can inspect the binary with [`ttrt`](./ttrt.md):
diff --git a/docs/src/build.md b/docs/src/build.md
index 4d3bbae99d..b075e54aca 100644
--- a/docs/src/build.md
+++ b/docs/src/build.md
@@ -33,6 +33,7 @@ cmake --build build
 ```

 > - To enable the ttnn/metal runtime add `-DTTMLIR_ENABLE_RUNTIME=ON`
+> - To enable the ttnn/metal perf runtime add `-DTT_RUNTIME_ENABLE_PERF_TRACE=ON` and set `export ENABLE_TRACY=1` in your environment before building
 > - To accelerate the builds with ccache use `-DCMAKE_CXX_COMPILER_LAUNCHER=ccache`
 > - To accelerate builds further, if python bindings aren't needed, `-DTTMLIR_ENABLE_BINDINGS_PYTHON=OFF`. For some reason the python bindings link step is very slow.
 > - TTNN build is automatically integrated / handled by tt-mlir cmake build system. For debugging and further information regarding the TTNN backend build step, please refer to [TTNN Documentation](https://tenstorrent.github.io/tt-metal/latest/ttnn/ttnn/installing.html).
@@ -45,11 +46,11 @@ cmake --build build
 > For more information, please refer to
 > [TT-NN and TT-Metalium installation documentation](https://tenstorrent.github.io/tt-metal/latest/ttnn/ttnn/installing.html#step-4-install-and-start-using-tt-nn-and-tt-metalium).

-| OS | Offline Compiler Only | Runtime Enabled Build |
-|----|-----------------------|-----------------------|
-| Ubuntu 22.04 | ✅ | ❌ |
-| Ubuntu 20.04 | ✅ | ✅ |
-| MacOS | ✅ | ❌ |
+| OS | Offline Compiler Only | Runtime Enabled Build | Runtime + Perf Enabled Build |
+|----|-----------------------|-----------------------|------------------------------|
+| Ubuntu 22.04 | ✅ | ❌ | ❌ |
+| Ubuntu 20.04 | ✅ | ✅ | ✅ |
+| macOS | ✅ | ❌ | ❌ |

 ## Test

@@ -104,10 +105,11 @@ For more information visit [pre-commit](https://pre-commit.com/)

 ```bash
 source env/activate
 cmake --build build -- docs
-mdbook serve build/docs/book
+mdbook serve build/docs
 ```

 > - `mdbook` can be installed with the system's package manager.
+> - `mdbook serve` starts a local server at `http://localhost:3000` by default.

 ## Dependencies

@@ -119,6 +121,15 @@ Make sure to have Git LFS installed. You can install it with the following command

 ```bash
 sudo apt-get install git-lfs
 ```

+If you are building with the performance trace enabled (`-DTT_RUNTIME_ENABLE_PERF_TRACE=ON`), you will also need to install the following packages:
+
+```bash
+pip install loguru
+pip install torch
+pip install pandas
+pip install seaborn
+```
+
 ### Ubuntu 22.04

 We need to install Ninja which can be done with the following command
diff --git a/docs/src/dialects-overview.md b/docs/src/dialects-overview.md
index 9856655cf9..e886fb90c1 100644
--- a/docs/src/dialects-overview.md
+++ b/docs/src/dialects-overview.md
@@ -6,7 +6,7 @@ individual dialect documentation for more details.:
 - `tt`: Common types such as, `tt.tile`, `tt.layout`, `tt.grid`, etc. and enums such as, data formats, memory spaces, iterator types etc.
 - `ttir`: A high level dialect that models the tensor compute graph on tenstorrent devices. Accepts `tosa` and `linalg` input.
   - `ttir.generic`: Generically describe compute work.
-  - `ttir.layout`: Convert between different tensor memory layouts and transfer between different memory spaces.
+  - `ttir.to_layout`: Convert between different tensor memory layouts and transfer between different memory spaces.
   - `tensor.pad`: Pad a tensor with a value (ie. convs)
   - `ttir.yield`: return result memref of computation in dispatch region body, lowers to `ttkernel.yield`
   - `ttir.kernel`: lowers to some backend kernel
diff --git a/docs/src/specs/device.md b/docs/src/specs/device.md
new file mode 100644
index 0000000000..ac0e490e4d
--- /dev/null
+++ b/docs/src/specs/device.md
@@ -0,0 +1,295 @@
+# Device
+
+Device in tt-mlir is somewhat of an overloaded term and can refer to different
+things depending on the context. This document will only speak to the compiler's
+abstract representation of a device captured by attribute `#tt.device`.
+
+## Terms
+
+There are many overloaded terms when talking about devices and grids; this
+document will use the following definitions:
+
+- **Physical Grid**: A 2D array of tensix cores on a chip.
+- **Chip**: A single physical chip with a **Physical Grid** of cores.
+- **Card**: A PCIE or Ethernet card that may contain multiple **Chips**.
+- **System**: A collection of **Cards** that are usually connected together on the
+  same host via PCIE or networked via ethernet. A system is represented by
+  `SystemDesc` in the compiler.
+- **Device**: Device is always presented as a single entity to the enclosing
+  scope, but it may be virtualized to abstract a multi-card **System** and
+  part of its encoding carries a **Logical Grid**. Another way to think of device
+  is a view over the system.
+- **Logical Grid** or just **Grid**: A logical shape that abstracts one or
+  more **Physical Grids**.
+
+## Motivation
+
+The device attribute strives to achieve the following goals:
+- Provide a convenient representation of a physical grid that decouples the
+  logical division of tensors from the physical layout of the hardware. This not
+  only simplifies reasoning about how tensors get divided into shards, but can also
+  enable reinterpretations of the device grid for data layout optimization decoupled
+  from the existing encoding of the tensor layouts.
+- Following the first point, the device attribute should be able to represent
+  many different forms of logical grids, from simple 2D grids, to more complex
+  topologies like extra-wide grids or higher dimensional grids.
+- Encode both single chip and multi-chip systems
+  under a single, virtualized representation.
+- Enable many forms of data parallel execution strategies for single and
+  multi chip systems under a single representation.
+
+## Examples
+
+All of the following examples will assume the physical hardware has an 8x8 physical
+grid of cores. We will use notation `[N, 8x8]` to represent an `N` chip system,
+each with an 8x8 physical grid.
+
+`#tt.device` in its simplest, single chip form `[1, 8x8]`, just maps directly 1-1 to the
+underlying physical hardware device.
+
+```mlir
+#tt.device<#tt.grid<8x8, (d0, d1) -> (0, d0, d1)>, [0]>
+```
+
+Let's break down what each of these attributes means:
+- `#tt.grid<8x8, (d0, d1) -> (0, d0, d1)>`: This is a 2D logical grid with dim 8x8.
+  It's followed by an affine map `(d0, d1) -> (0, d0, d1)` that provides a mapping
+  from the logical grid to the physical grid. In this case, the logical grid is the same
+  as the physical grid, so the mapping is the identity function. The logical
+  grid can have any rank, but the physical mapping is always 3D, with the first
+  dimension being the chip index, followed by the 2D physical core index within the chip.
+- `[0]`: This is a list of chip indices. These chip indices directly reference
+  the same chip indices in the system descriptor. The `SystemDesc` attribute
+  that this is in reference to is tagged on the top level `ModuleOp`.
+
+Specific examples that this document will cover:
+- [Data Parallel Over Batch](#data-parallel-over-batch)
+- [Data Parallel Over 2d](#data-parallel-over-2d)
+- [Data Parallel Over 2d and Batch](#data-parallel-over-2d-and-batch)
+- [Pipeline Parallel](#pipeline-parallel)
+- [Reinterpreted Grids (Transpose)](#reinterpreted-grids-transpose)
+- [Reinterpreted Grids (Training Usecase)](#reinterpreted-grids-training-usecase)
+- [Reinterpreted Grids (Extra)](#reinterpreted-grids-extra)
+
+> Before we move on to more complex examples, it's worth having on hand:
+> - The python test `test/python/device_attr.py` which shows how all of these examples
+>   can actually be programmed for the device attribute.
+> - The [Tensor Layout](./tensor-layout.md) spec as the following examples
+>   will demonstrate how tensor layout interacts with the logical device grid.
+
+> **Note on Data Parallel**: There is existing literature that explicitly distinguishes
+> between data parallel and tensor parallel, oftentimes describing data parallel
+> as duplicating the model across multiple devices and trivially dividing up the batch,
+> whereas tensor parallel refers to tensor data being distributed and potentially
+> communicated between devices during execution. While this is true for multi-GPU/CPU
+> systems, it is somewhat of an implementation detail, and given the flexibility
+> of Tenstorrent hardware there is an opportunity to generalize this concept. In this
+> document we will use the term data parallel to refer to any form of parallelism that
+> divides any dimension of the tensor across multiple cores/chips.
+
+> **Note on Constraints**: Many of the examples below require careful virtualization
+> of the underlying physical system, i.e. some device configurations might
+> only work if the chips are connected via ethernet and with a particular
+> topology, but these constraints are
+> outside the scope of the examples and will be discussed further in the
+> [Backend Lowering and Constraints](#backend-lowering-and-constraints) section.
+
+### Data Parallel Over Batch
+
+Given a 2 chip system, `[2, 8x8]`, we can represent a simple data parallel
+logical grid that divides the batch dimension in half across the two chips.
+
+```mlir
+#tt.device<#tt.grid<2x8x8, (d0, d1, d2) -> (d0, d1, d2)>, [0, 1]>
+```
+
+The affine map here is just identity, so dims `d1` and `d2` directly index
+the physical grid and `d0` indexes the chip.
+
+Now we can consider some tensor that, importantly, has a grid of the same rank as
+the logical device grid:
+
+```mlir
+tensor<16x3x64x128xf32,
+  #tt.layout<(d0, d1, d2, d3) -> (d0, d1 * 64 + d2, d3),
+    undef,
+    <2x2x4>,
+    memref<8x3x1x!tt.tile<32 x 32, bfp_bf8>, #tt.memory_space<l1>>
+  >
+>
+```
+
+If we map this tensor onto the above device, it will span across both chips,
+half of the batch dimension on each chip. Within each chip the tensor occupies
+a 2x4 grid out of the 8x8 physical grid available.
+
+### Data Parallel Over 2d
+
+In this example we will consider a 2 chip system, `[2, 8x8]`, and view it as
+though the two chips are concatenated together side by side to form a single
+`8x16` grid.
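+
+> As a reading aid for the affine map in the attribute below (a hypothetical,
+> standalone sketch, not part of tt-mlir), the wrap-around mapping
+> `(d0, d1) -> ((d0 floordiv 8) * 2 + d1 floordiv 8, d0, d1 mod 8)` can be
+> evaluated with plain integer arithmetic:
+
+```cpp
+#include <cstdio>
+
+int main() {
+  // Logical core (3, 12) on the 8x16 logical grid.
+  long long d0 = 3, d1 = 12;
+  long long chip = (d0 / 8) * 2 + d1 / 8; // columns 8..15 wrap onto chip 1
+  long long y = d0;                       // physical row on that chip
+  long long x = d1 % 8;                   // physical column on that chip
+  std::printf("chip=%lld y=%lld x=%lld\n", chip, y, x); // chip=1 y=3 x=4
+  return 0;
+}
+```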
+
+```mlir
+#tt.device<#tt.grid<8x16, (d0, d1) -> ((d0 floordiv 8) * 2 + d1 floordiv 8, d0, d1 mod 8)>, [0, 1]>
+```
+
+Here we can see that the affine map encodes an indexing pattern such that when
+we extend past 8 cores in the second dimension, we wrap around to the next chip.
+
+Now we can consider some tensor that, importantly, has a grid of the same rank as
+the logical device grid:
+
+```mlir
+tensor<256x1024xf32,
+  #tt.layout<(d0, d1) -> (d0, d1),
+    undef,
+    <4x16>,
+    memref<2x2x!tt.tile<32 x 32, bfp_bf8>, #tt.memory_space<l1>>
+  >
+>
+```
+
+This single tensor maps trivially onto the logical grid, spanning the upper
+half. Decoupled from the tensor's layout, under the hood the tensor is actually
+physically spanning across two chips.
+
+### Data Parallel Over 2d and Batch
+
+The previous 2 examples can be composed together to form a logical grid that
+divides tensors across multiple dimensions. Here we will consider a 4 chip
+system `[4, 8x8]` and view it as a `2x8x16` grid.
+
+```mlir
+#tt.device<#tt.grid<2x8x16, (d0, d1, d2) -> (d0 * 2 + (d1 floordiv 8) * 2 + d2 floordiv 8, d1, d2 mod 8)>, [0, 1, 2, 3]>
+```
+
+We can evaluate the affine map to see that the chips are interpreted in chunks of
+two, where groups `[0, 1]` and `[2, 3]` each form 8x16 grids and these 2 groups
+concatenate to form a 2x8x16 grid.
+
+We can consider the following tensor to map onto this grid:
+
+```mlir
+tensor<64x256x1024xf32,
+  #tt.layout<(d0, d1, d2) -> (d0, d1, d2),
+    undef,
+    <2x4x16>,
+    memref<32x2x2x!tt.tile<32 x 32, bfp_bf8>, #tt.memory_space<l1>>
+  >
+>
+```
+
+### Pipeline Parallel
+
+Pipeline parallel in the scope of this spec isn't particularly interesting; it
+is intended to be used in conjunction with the `ttir.pipeline` operation which
+will group sections of the module's operations into groups to form pipeline regions
+and will be covered in a separate spec.
+
+What we can demonstrate here is how we can take multiple non-overlapping views
+of the system descriptor to form distinct virtual devices.
+
+Given an 8 chip system `[8, 8x8]`, we can form two virtual devices that each
+take 4 chips and interpret them differently (though they could take the same
+logical grid).
+
+```mlir
+#tt.device<#tt.grid<2x8x16, (d0, d1, d2) -> (d0 * 2 + (d1 floordiv 8) * 2 + d2 floordiv 8, d1, d2 mod 8)>, [0, 1, 2, 3]>
+#tt.device<#tt.grid<16x16, (d0, d1) -> ((d0 floordiv 8) * 2 + d1 floordiv 8, d0 mod 8, d1 mod 8)>, [4, 5, 6, 7]>
+```
+
+### Reinterpreted Grids (Transpose)
+
+One particularly interesting use case that logical grids could enable is to
+reinterpret the grid as a form of data layout optimization. For example, if we
+wanted to transpose a tensor, instead of having to move the data around to
+implement transpose, we could instead reinterpret the grid as being transposed,
+leveraging the fact that the relevant data is already located on the correct
+cores/chips.
+
+To keep things simple, let's consider a 1 chip system `[1, 8x8]`, but it's not
+too big a leap to see how this could map to multi-chip where the cost of moving
+data is even higher.
+
+Let's also consider a simple (totally contrived) eltwise unary graph:
+
+```python
+a = exp(a)
+aT = transpose(a)
+relu(aT)
+```
+
+1. We'll establish a regular, single chip, identity logical grid:
+```mlir
+#tt.device<#tt.grid<8x8, (d0, d1) -> (0, d0, d1)>, [0]>
+```
+2. Execute `exp`.
+3. We'll reinterpret the grid as transposed:
+```mlir
+#tt.device<#tt.grid<8x8, (d0, d1) -> (0, d1, d0)>, [0]>
+```
+4. _Execute_ `transpose`. Note that each core only needs to transpose its
+   data locally. Eventually this could be implemented as a no-op by reindexing
+   the tile visitation order of the successive operation.
+5. Execute `relu`.
+
+It's important to note that we effectively implemented transpose without moving
+data anywhere.
+
+### Reinterpreted Grids (Extra)
+
+For the sake of examples, here are a few more ways of reinterpreting the logical grid.
+
+#### Extra Wide Grid
+```mlir
+#tt.device<#tt.grid<1x64, (d0, d1) -> (0, d0 * 8 + d1 floordiv 8, d1 mod 8)>, [0]>
+```
+
+#### Extra Tall + Transposed Grid
+```mlir
+#tt.device<#tt.grid<64x1, (d0, d1) -> (0, d1 * 8 + d0 floordiv 8, d0 mod 8)>, [0]>
+```
+
+#### Staircase
+```mlir
+#tt.device<#tt.grid<8x8, (d0, d1) -> (0, d0, (d0 + d1) mod 8)>, [0]>
+```
+
+This could be an interesting starting position for data in implementing matmul as a
+systolic array in a ring topology.
+
+## Backend Lowering and Constraints
+
+While the above device attribute encoding is quite flexible, this does not
+necessarily mean the target backend can actually support all of these
+interpretations. The TTNN backend will be relatively constrained to support only
+the specialized grid topologies that are supported by the API.
+
+### TTNN
+
+TODO:
+
+- Multi-device
+- Grid orientation
+- Height / Width sharded
+- TTNN Generic
+
+### TTMetal
+
+In the TTMetal dialect we are only constrained by what we've implemented in the
+tt-mlir compiler; this means it is much more flexible and can theoretically
+support any of the grid interpretations above.
+
+## Test Plan
+
+- `test/python/device_attr.py` covers all of the examples above and asserts the
+  IR is correctly generated.
+- Additional functional unit tests will be added as op and runtime support is
+  added.
+
+## Concerns
+
+- `tt.device` is very flexible, but with this flexibility comes the potential
+  for misuse. It's important that the compiler is able to validate the legal
+  configurations of this attribute for the target backend.
diff --git a/docs/src/ttmlir-opt.md b/docs/src/ttmlir-opt.md
index fd58e57187..fe14d2ff46 100644
--- a/docs/src/ttmlir-opt.md
+++ b/docs/src/ttmlir-opt.md
@@ -2,12 +2,6 @@
 The `ttmlir` optimizer driver. This tool is used to run the `ttmlir` compiler
 passes on `.mlir` source files and is central to developing and testing the compiler.

-## Generate a flatbuffer file
-
-```bash
-./build/bin/ttmlir-opt --ttir-to-ttnn-backend-pipeline --ttnn-serialize-to-binary="output=out.ttnn" test/ttmlir/Dialect/TTNN/simple_multiply.mlir
-```
-
 ## Simple Test

 ```bash
diff --git a/docs/src/ttmlir-translate.md b/docs/src/ttmlir-translate.md
index fa9fd8287a..fa6f0bef50 100644
--- a/docs/src/ttmlir-translate.md
+++ b/docs/src/ttmlir-translate.md
@@ -11,6 +11,15 @@ The `ttmlir-translate` translation utility. Unlike `ttmlir-opt` tool which is us
Unlike `ttmlir-opt` tool which is us ./build/bin/ttmlir-translate -mlir-to-cpp c.mlir -allow-unregistered-dialect ``` +## Generate flatbuffer file from MLIR +```bash +# First run `ttmlir-opt` to convert to proper dialect +./build/bin/ttmlir-opt --ttir-to-ttnn-backend-pipeline test/ttmlir/Dialect/TTNN/simple_multiply.mlir -o ttnn.mlir + +# Now run `ttmlir-translate` to produce flatbuffer file +./build/bin/ttmlir-translate --ttnn-to-flatbuffer ttnn.mlir -o out.ttnn +``` + Bonus: These two commands can be piped, to avoid writing a `mlir` file to disk, like so: ```bash ./build/bin/ttmlir-opt --ttir-layout --ttnn-open-device --convert-ttir-to-ttnn --convert-ttnn-to-emitc test/ttmlir/Dialect/TTNN/simple_multiply.mlir | ./build/bin/ttmlir-translate -mlir-to-cpp -allow-unregistered-dialect diff --git a/docs/src/ttrt.md b/docs/src/ttrt.md index b062af9109..c9a0da3522 100644 --- a/docs/src/ttrt.md +++ b/docs/src/ttrt.md @@ -16,34 +16,71 @@ ttrt --help See the [ttmlir-opt](./ttmlir-opt.md) documentation for more information on how to generate a flatbuffer file. -## Read sections from the flatbuffer +## APIs +```bash +ttrt --help +``` +### read ```bash +ttrt read --help +ttrt read --section mlir out.ttnn +ttrt read --section cpp out.ttnn ttrt read --section version out.ttnn ttrt read --section system-desc out.ttnn -ttrt read out.ttnn # Dump the whole thing as json -ttrt read --help +ttrt read --section inputs out.ttnn +ttrt read --section outputs out.ttnn +ttrt read --section all out.ttnn +ttrt read --section all out.ttnn --clean-artifacts +ttrt read --section all out.ttnn --save-artifacts +ttrt read --section all /dir/of/flatbuffers ``` -## Query information about the current system +### run +Note: It's required to be on a system with silicon and to have a runtime enabled +build `-DTTMLIR_ENABLE_RUNTIME=ON`. + +```bash +ttrt run --help +ttrt run out.ttnn +ttrt run out.ttnn --clean-artifacts +ttrt run out.ttnn --save-artifacts +ttrt run out.ttnn --loops 10 +ttrt run --program-index all out.ttnn +ttrt run --program-index 0 out.ttnn +ttrt run /dir/of/flatbuffers +ttrt run /dir/of/flatbuffers --loops 10 +``` +### query Note: It's required to be on a system with silicon and to have a runtime enabled build `-DTTMLIR_ENABLE_RUNTIME=ON`. ```bash +ttrt query --help ttrt query --system-desc -ttrt query --save-system-desc n300.ttsys +ttrt query --system-desc-as-json +ttrt query --system-desc-as-dict +ttrt query --save-artifacts +ttrt query --clean-artifacts ``` -## Execute flatbuffer files - -Note: -- It's required to be on a system with silicon and to have a runtime enabled -build `-DTTMLIR_ENABLE_RUNTIME=ON`. -- It's required to have installed `torch` in your python environment. +### perf +Note: It's required to be on a system with silicon and to have a runtime enabled +build `-DTTMLIR_ENABLE_RUNTIME=ON`. Also need perf enabled build `-DTT_RUNTIME_ENABLE_PERF_TRACE=ON` with `export ENABLE_TRACY=1`. 
 ```bash
-ttrt run out.ttnn
+ttrt perf --help
+ttrt perf out.ttnn
+ttrt perf out.ttnn --clean-artifacts
+ttrt perf out.ttnn --save-artifacts
+ttrt perf out.ttnn --loops 10
+ttrt perf --program-index all out.ttnn
+ttrt perf --program-index 0 out.ttnn
+ttrt perf --device out.ttnn
+ttrt perf --generate-params --perf-csv trace.csv
+ttrt perf /dir/of/flatbuffers
+ttrt perf /dir/of/flatbuffers --loops 10
 ```

 ## ttrt is written as a python library, so it can be used in custom python scripts
@@ -54,3 +91,6 @@ import ttrt.binary
 fbb = ttrt.binary.load_from_path("out.ttnn")
 d = ttrt.binary.as_dict(fbb)
 ```
+
+## Bonus
+Artifacts are saved in the `ttrt-artifacts` directory if the `--save-artifacts` option is provided.
diff --git a/env/CMakeLists.txt b/env/CMakeLists.txt
index 6df12416aa..f19b60ee47 100644
--- a/env/CMakeLists.txt
+++ b/env/CMakeLists.txt
@@ -1,8 +1,8 @@
 cmake_minimum_required(VERSION 3.20.0)
 project(ttmlir-toolchain LANGUAGES CXX C)

-set(FLATBUFFERS_VERSION "v24.3.7")
-set(LLVM_PROJECT_VERSION "llvmorg-18.1.0")
+set(FLATBUFFERS_VERSION "fb9afbafc7dfe226b9db54d4923bfb8839635274")
+set(LLVM_PROJECT_VERSION "9ddfe62f5c11e3f65f444209f514029ded2d58b9")

 include(ExternalProject)
diff --git a/env/activate b/env/activate
index f21adff2fa..26de77394e 100644
--- a/env/activate
+++ b/env/activate
@@ -12,5 +12,7 @@ fi
 export TTMLIR_ENV_ACTIVATED=1
 export PATH=$TTMLIR_TOOLCHAIN_DIR/bin:$TTMLIR_TOOLCHAIN_DIR/venv/bin:$PATH
 export TT_METAL_HOME="$(pwd)/third_party/tt-metal/src/tt-metal"
-export PYTHONPATH="$(pwd)/build/python_packages:$(pwd)/.local/toolchain/python_packages/mlir_core"
+export TT_METAL_BUILD_HOME="$(pwd)/third_party/tt-metal/src/tt-metal-build"
+export TT_MLIR_HOME="$(pwd)"
+export PYTHONPATH="$(pwd)/build/python_packages:$(pwd)/.local/toolchain/python_packages/mlir_core:${TT_METAL_HOME}:${TT_METAL_HOME}/tt_eager:${TT_METAL_BUILD_HOME}/tools/profiler/bin"
 export ARCH_NAME="${ARCH_NAME:-wormhole_b0}"
diff --git a/include/ttmlir-c/TTAttrs.h b/include/ttmlir-c/TTAttrs.h
new file mode 100644
index 0000000000..750c201b09
--- /dev/null
+++ b/include/ttmlir-c/TTAttrs.h
@@ -0,0 +1,74 @@
+// SPDX-FileCopyrightText: (c) 2024 Tenstorrent AI ULC
+//
+// SPDX-License-Identifier: Apache-2.0
+
+#ifndef TTMLIR_C_TTATTRS_H
+#define TTMLIR_C_TTATTRS_H
+
+#include "mlir-c/AffineMap.h"
+#include "ttmlir-c/Dialects.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+MLIR_CAPI_EXPORTED MlirAttribute ttmlirTTGridAttrGet(MlirContext ctx,
+                                                     int64_t *shape,
+                                                     size_t shapeSize);
+
+MLIR_CAPI_EXPORTED MlirAttribute
+ttmlirTTChipCapabilityAttrGet(MlirContext ctx, uint32_t chipCapability);
+
+MLIR_CAPI_EXPORTED MlirAttribute ttmlirTTArchAttrGet(MlirContext ctx,
+                                                     uint32_t arch);
+
+MLIR_CAPI_EXPORTED MlirAttribute ttmlirTTChipDescAttrGet(
+    MlirContext ctx, MlirAttribute arch, int64_t *grid, size_t gridSize,
+    unsigned l1Size, unsigned numDramChannels, unsigned dramChannelSize,
+    unsigned nocL1AddressAlignBytes, unsigned pcieAddressAlignBytes,
+    unsigned nocDRAMAddressAlignBytes);
+
+MLIR_CAPI_EXPORTED MlirAttribute ttmlirTTChipCoordAttrGet(
+    MlirContext ctx, unsigned rack, unsigned shelf, unsigned y, unsigned x);
+
+MLIR_CAPI_EXPORTED MlirAttribute ttmlirTTChipChannelAttrGet(MlirContext ctx,
+                                                            unsigned endpoint0,
+                                                            unsigned endpoint1);
+
+MLIR_CAPI_EXPORTED MlirAttribute ttmlirTTSystemDescAttrGet(
+    MlirContext ctx, MlirAttribute *chipDescs, size_t chipDescsSize,
+    unsigned *chipDescIndices, size_t chipDescIndicesSize,
+    MlirAttribute *chipCapabilities, size_t chipCapabilitiesSize,
+    MlirAttribute *chipCoords, size_t chipCoordsSize,
+    MlirAttribute *chipChannels, size_t chipChannelsSize);
+
+MLIR_CAPI_EXPORTED MlirAttribute ttmlirTTLayoutAttrGet(MlirContext ctx,
+                                                       MlirAffineMap linear,
+                                                       unsigned oobVal,
+                                                       MlirAttribute grid,
+                                                       MlirType memref);
+
+MLIR_CAPI_EXPORTED MlirAttribute
+ttmlirTTMemorySpaceAttrGet(MlirContext ctx, uint32_t memorySpace);
+
+MLIR_CAPI_EXPORTED MlirAttribute ttmlirTTOOBValAttrGet(MlirContext ctx,
+                                                       uint32_t oobVal);
+
+MLIR_CAPI_EXPORTED MlirAttribute
+ttmlirTTIteratorTypeAttrGet(MlirContext ctx, uint32_t iteratorType);
+
+MLIR_CAPI_EXPORTED MlirAttribute ttmlirTTIteratorTypeArrayAttrGet(
+    MlirContext ctx, uint32_t *iteratorTypes, size_t iteratorTypesSize);
+
+MLIR_CAPI_EXPORTED MlirAttribute
+ttmlirTTOperandConstraintAttrGet(MlirContext ctx, uint32_t OperandConstraint);
+
+MLIR_CAPI_EXPORTED MlirAttribute ttmlirTTOperandConstraintArrayAttrGet(
+    MlirContext ctx, uint32_t *OperandConstraints,
+    size_t OperandConstraintsSize);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif // TTMLIR_C_TTATTRS_H
diff --git a/include/ttmlir-c/TTTypes.h b/include/ttmlir-c/TTTypes.h
new file mode 100644
index 0000000000..8038a2d465
--- /dev/null
+++ b/include/ttmlir-c/TTTypes.h
@@ -0,0 +1,25 @@
+// SPDX-FileCopyrightText: (c) 2024 Tenstorrent AI ULC
+//
+// SPDX-License-Identifier: Apache-2.0
+
+#ifndef TTMLIR_C_TTTYPES_H
+#define TTMLIR_C_TTTYPES_H
+
+#include "ttmlir-c/Dialects.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+MLIR_CAPI_EXPORTED MlirType ttmlirTTTileTypeGet(MlirContext ctx,
+                                                unsigned height, unsigned width,
+                                                uint32_t dataType);
+
+MLIR_CAPI_EXPORTED MlirType ttmlirTTDeviceTypeGet(MlirContext ctx,
+                                                  MlirAttribute deviceAttr);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif // TTMLIR_C_TTTYPES_H
diff --git a/include/ttmlir/Conversion/Passes.h b/include/ttmlir/Conversion/Passes.h
index 9ed050d512..bd4ee2753b 100644
--- a/include/ttmlir/Conversion/Passes.h
+++ b/include/ttmlir/Conversion/Passes.h
@@ -5,7 +5,10 @@
 #ifndef TTMLIR_CONVERSION_PASSES_H
 #define TTMLIR_CONVERSION_PASSES_H

+#include "ttmlir/Conversion/TTIRToTTNN/TTIRToTTNN.h"
 #include "ttmlir/Conversion/TTNNToEmitC/TTNNToEmitC.h"
+#include "ttmlir/Conversion/TosaToTTIR/TosaToTTIR.h"
+#include "ttmlir/Dialect/TTIR/IR/TTIR.h"
 #include "ttmlir/Dialect/TTNN/IR/TTNN.h"
 #include "mlir/Dialect/EmitC/IR/EmitC.h"
diff --git a/include/ttmlir/Conversion/Passes.td b/include/ttmlir/Conversion/Passes.td
index 3d8e3bdcb8..42f2b267db 100644
--- a/include/ttmlir/Conversion/Passes.td
+++ b/include/ttmlir/Conversion/Passes.td
@@ -7,7 +7,19 @@

 include "mlir/Pass/PassBase.td"

-def ConvertTTNNToEmitC : Pass<"convert-ttnn-to-emitc", "::mlir::func::FuncOp"> {
+def ConvertTosaToTTIR : Pass<"convert-tosa-to-ttir", "::mlir::ModuleOp"> {
+  let summary = "Convert TOSA dialect to TTIR dialect.";
+  let constructor = "createConvertTosaToTTIRPass()";
+  let dependentDialects = ["mlir::tt::ttir::TTIRDialect"];
+}
+
+def ConvertTTIRToTTNN: Pass<"convert-ttir-to-ttnn", "::mlir::ModuleOp"> {
+  let summary = "Convert TTIR dialect to TTNN dialect.";
+  let constructor = "createConvertTTIRToTTNNPass()";
+  let dependentDialects = ["mlir::tt::ttir::TTIRDialect", "mlir::tt::ttnn::TTNNDialect"];
+}
+
+def ConvertTTNNToEmitC : Pass<"convert-ttnn-to-emitc", "::mlir::ModuleOp"> {
   let summary = "Convert TTNN dialect to EmitC dialect.";
   let constructor = "createConvertTTNNToEmitCPass()";
   let dependentDialects = ["mlir::emitc::EmitCDialect", "mlir::tt::ttnn::TTNNDialect"];
diff --git a/include/ttmlir/Conversion/TTIRToTTNN/TTIRToTTNN.h b/include/ttmlir/Conversion/TTIRToTTNN/TTIRToTTNN.h
new file mode 100644
index 0000000000..5163188047
--- /dev/null
+++ b/include/ttmlir/Conversion/TTIRToTTNN/TTIRToTTNN.h
@@ -0,0 +1,20 @@
+// SPDX-FileCopyrightText: (c) 2024 Tenstorrent AI ULC
+//
+// SPDX-License-Identifier: Apache-2.0
+
+#ifndef TTMLIR_CONVERSION_TTIRTOTTNN_TTIRTOTTNN_H
+#define TTMLIR_CONVERSION_TTIRTOTTNN_TTIRTOTTNN_H
+
+#include "mlir/Pass/Pass.h"
+#include "mlir/Transforms/DialectConversion.h"
+
+namespace mlir::tt {
+
+void populateTTIRToTTNNPatterns(MLIRContext *ctx, RewritePatternSet &patterns,
+                                TypeConverter &typeConverter);
+
+std::unique_ptr<OperationPass<ModuleOp>> createConvertTTIRToTTNNPass();
+
+} // namespace mlir::tt
+
+#endif // TTMLIR_CONVERSION_TTIRTOTTNN_TTIRTOTTNN_H
diff --git a/include/ttmlir/Conversion/TTNNToEmitC/TTNNToEmitC.h b/include/ttmlir/Conversion/TTNNToEmitC/TTNNToEmitC.h
index 8211d65ee2..9d412144ea 100644
--- a/include/ttmlir/Conversion/TTNNToEmitC/TTNNToEmitC.h
+++ b/include/ttmlir/Conversion/TTNNToEmitC/TTNNToEmitC.h
@@ -5,13 +5,15 @@
 #ifndef TTMLIR_CONVERSION_TTNNTOEMITC_TTNNTOEMITC_H
 #define TTMLIR_CONVERSION_TTNNTOEMITC_TTNNTOEMITC_H

-#include "mlir/Dialect/Func/IR/FuncOps.h"
-#include "mlir/IR/BuiltinOps.h"
 #include "mlir/Pass/Pass.h"
+#include "mlir/Transforms/DialectConversion.h"

 namespace mlir::tt {

-std::unique_ptr<OperationPass<func::FuncOp>> createConvertTTNNToEmitCPass();
+void populateTTNNToEmitCPatterns(MLIRContext *ctx, RewritePatternSet &patterns,
+                                 TypeConverter &typeConverter);
+
+std::unique_ptr<OperationPass<ModuleOp>> createConvertTTNNToEmitCPass();

 } // namespace mlir::tt
diff --git a/include/ttmlir/Conversion/TosaToTTIR/TosaToTTIR.h b/include/ttmlir/Conversion/TosaToTTIR/TosaToTTIR.h
new file mode 100644
index 0000000000..acd5373c90
--- /dev/null
+++ b/include/ttmlir/Conversion/TosaToTTIR/TosaToTTIR.h
@@ -0,0 +1,17 @@
+// SPDX-FileCopyrightText: (c) 2024 Tenstorrent AI ULC
+//
+// SPDX-License-Identifier: Apache-2.0
+
+#ifndef TTMLIR_CONVERSION_TOSATOTTIR_TOSATOTTIR_H
+#define TTMLIR_CONVERSION_TOSATOTTIR_TOSATOTTIR_H
+
+#include "mlir/IR/BuiltinOps.h"
+#include "mlir/Pass/Pass.h"
+
+namespace mlir::tt {
+
+std::unique_ptr<OperationPass<ModuleOp>> createConvertTosaToTTIRPass();
+
+} // namespace mlir::tt
+
+#endif
diff --git a/include/ttmlir/Dialect/TT/IR/TTOpsTypes.h b/include/ttmlir/Dialect/TT/IR/TTOpsTypes.h
index 4ea14969f6..5b90a9a742 100644
--- a/include/ttmlir/Dialect/TT/IR/TTOpsTypes.h
+++ b/include/ttmlir/Dialect/TT/IR/TTOpsTypes.h
@@ -28,4 +28,8 @@ inline bool isDeviceMemorySpace(MemorySpace memorySpace) {
 #define GET_TYPEDEF_CLASSES
 #include "ttmlir/Dialect/TT/IR/TTOpsTypes.h.inc"

+namespace mlir::tt {
+DeviceAttr getCurrentScopeDevice(Operation *op);
+} // namespace mlir::tt
+
 #endif
diff --git a/include/ttmlir/Dialect/TT/IR/TTOpsTypes.td b/include/ttmlir/Dialect/TT/IR/TTOpsTypes.td
index f2f0d52ce9..27a64d7861 100644
--- a/include/ttmlir/Dialect/TT/IR/TTOpsTypes.td
+++ b/include/ttmlir/Dialect/TT/IR/TTOpsTypes.td
@@ -33,8 +33,8 @@ def TT_GridAttr : TT_Attr<"Grid", "grid"> {
   let parameters = (ins
     ArrayRefParameter<"int64_t">:$shape,
     DefaultValuedParameter<
      "AffineMap",
-      "$_builder.getEmptyAffineMap()">:$physical_grid_mapping);
-  let assemblyFormat = "`<` custom<DimensionList>($shape) (`,` $physical_grid_mapping^)? `>`";
+      "$_builder.getEmptyAffineMap()">:$mapping);
+  let assemblyFormat = "`<` custom<DimensionList>($shape) (`,` $mapping^)? `>`";

   let extraClassDeclaration = [{
     static GridAttr get(::mlir::MLIRContext *context) {
@@ -61,8 +61,8 @@ def TT_ChipDescAttr : TT_Attr<"ChipDesc", "chip_desc"> {
     TT chip_desc attribute
   }];

-  let parameters = (ins "ArchAttr":$arch, TT_GridAttr:$grid, "unsigned":$l1Size, "unsigned":$numDramChannels, "unsigned":$dramChannelSize, "unsigned":$nocL1AddressAlignBytes, "unsigned":$pcieAddressAlignBytes, "unsigned":$nocDRAMAddressAlignBytes);
-  let assemblyFormat = "`{` `arch` `=` $arch `,` `grid` `=` $grid `,` `l1_size` `=` $l1Size `,` `num_dram_channels` `=` $numDramChannels `,` `dram_channel_size` `=` $dramChannelSize `,` `noc_l1_address_align_bytes` `=` $nocL1AddressAlignBytes `,` `pcie_address_align_bytes` `=` $pcieAddressAlignBytes `,` `noc_dram_address_align_bytes` `=` $nocDRAMAddressAlignBytes `}`";
+  let parameters = (ins "ArchAttr":$arch, ArrayRefParameter<"int64_t">:$grid, "unsigned":$l1Size, "unsigned":$numDramChannels, "unsigned":$dramChannelSize, "unsigned":$nocL1AddressAlignBytes, "unsigned":$pcieAddressAlignBytes, "unsigned":$nocDRAMAddressAlignBytes);
+  let assemblyFormat = "`{` `arch` `=` $arch `,` `grid` `=` custom<DimensionList>($grid) `,` `l1_size` `=` $l1Size `,` `num_dram_channels` `=` $numDramChannels `,` `dram_channel_size` `=` $dramChannelSize `,` `noc_l1_address_align_bytes` `=` $nocL1AddressAlignBytes `,` `pcie_address_align_bytes` `=` $pcieAddressAlignBytes `,` `noc_dram_address_align_bytes` `=` $nocDRAMAddressAlignBytes `}`";
 }

 def TT_ChipCoordAttr : TT_Attr<"ChipCoord", "chip_coord"> {
@@ -100,6 +100,7 @@ def TT_SystemDescAttr : TT_Attr<"SystemDesc", "system_desc"> {

   let extraClassDeclaration = [{
     static tt::SystemDescAttr getDefault(MLIRContext *context);
+    static tt::SystemDescAttr getFromPath(MLIRContext *context, std::string& path);
   }];
 }

@@ -201,6 +202,24 @@ def TT_LayoutAttr : TT_Attr<"Layout", "layout"> {
   }];
 }

+def TT_DeviceAttr : TT_Attr<"Device", "device", []> {
+  let summary = "Device attribute in TT dialect";
+  let description = [{
+  }];
+  let parameters = (ins TT_GridAttr:$grid, ArrayRefParameter<"unsigned">:$chipIds);
+  let assemblyFormat = "`<` qualified($grid) `,` `[` $chipIds `]` `>`";
+
+  let extraClassDeclaration = [{
+    static DeviceAttr get(::mlir::MLIRContext *context, ArrayRef<int64_t> shape, AffineMap physicalGridMapping, ArrayRef<unsigned> chipIds) {
+      return DeviceAttr::get(context, GridAttr::get(context, shape, physicalGridMapping), chipIds);
+    }
+    static DeviceAttr get(::mlir::MLIRContext *context, SystemDescAttr systemDesc, ArrayRef<unsigned> chipIds);
+    static DeviceAttr get(::mlir::MLIRContext *context, SystemDescAttr systemDesc);
+  }];
+
+  let genVerifyDecl = 1;
+}
+
 def TT_MemorySpaceAttr : EnumAttr<TT_Dialect, TT_MemorySpace, "memory_space"> {
   let assemblyFormat = "`<` $value `>`";
 }
@@ -239,14 +258,15 @@ def TT_Tile : TT_Type<"Tile", "tile", [MemRefElementTypeInterface]> {
   let extraClassDeclaration = [{
     SmallVector<int64_t> getScalarShape(SmallVector<int64_t> tiledShape) const;
     SmallVector<int64_t> getTiledShape(SmallVector<int64_t> scalarShape) const;
+    uint64_t getSizeBytes() const;
   }];
 }

 def TT_Device : TT_Type<"Device", "device", []> {
   let summary = "TT device";
   let description = "Device type in TT dialect";
-  let parameters = (ins TT_GridAttr:$mesh, ArrayRefParameter<"unsigned">:$chipIds);
-  let assemblyFormat = "`<` qualified($mesh) `,` `[` $chipIds `]` `>`";
+  let parameters = (ins TT_DeviceAttr:$desc);
+  let assemblyFormat = "`<` $desc `>`";
 }

 #endif
diff --git a/include/ttmlir/Dialect/TTIR/Analysis/GridAnalysis.h b/include/ttmlir/Dialect/TTIR/Analysis/GridAnalysis.h
deleted file mode 100644
index 6a306d4de7..0000000000
--- a/include/ttmlir/Dialect/TTIR/Analysis/GridAnalysis.h
+++ /dev/null
@@ -1,58 +0,0 @@
-// SPDX-FileCopyrightText: (c) 2024 Tenstorrent AI ULC
-//
-// SPDX-License-Identifier: Apache-2.0
-
-#ifndef TTMLIR_DIALECT_TTIR_ANALYSIS_GRIDANALYSIS_H
-#define TTMLIR_DIALECT_TTIR_ANALYSIS_GRIDANALYSIS_H
-
-#include "ttmlir/Dialect/TTIR/Analysis/TTIRAnalysis.h"
-#include "llvm/ADT/StringMap.h"
-
-namespace mlir::tt::ttir {
-
-struct GridAnalysisResult {
-  int target_rows = 1;
-  int target_columns = 1;
-};
-
-struct GridAnalysisInput {
-  int max_supported_rows;
-  int max_supported_columns;
-  llvm::StringMap<SmallVector<int64_t, 2>> *grid_size_overrides;
-
-  GridAnalysisInput()
-      : max_supported_rows(1), max_supported_columns(1),
-        grid_size_overrides(nullptr) {}
-
-  GridAnalysisInput(int max_supported_rows, int max_supported_columns,
-                    llvm::StringMap<SmallVector<int64_t, 2>> *grid_size_overrides)
-      : max_supported_rows(max_supported_rows),
-        max_supported_columns(max_supported_columns),
-        grid_size_overrides(grid_size_overrides) {}
-
-  bool operator==(const GridAnalysisInput &rhs) const {
-    return max_supported_rows == rhs.max_supported_rows &&
-           max_supported_columns == rhs.max_supported_columns &&
-           grid_size_overrides == rhs.grid_size_overrides;
-  }
-
-  bool operator!=(const GridAnalysisInput &rhs) const {
-    return !(*this == rhs);
-  }
-};
-
-// Determine target grid size for each op.
-//
-class GridAnalysis
-    : public TTIRAnalysis<GridAnalysisInput, GridAnalysisResult> {
-
-private:
-  void analysisImplementation() override;
-  bool applyOverrides() override;
-
-public:
-  GridAnalysis(Operation *op) : TTIRAnalysis(op) {}
-};
-} // namespace mlir::tt::ttir
-
-#endif
diff --git a/include/ttmlir/Dialect/TTIR/Analysis/LegalGridAnalysis.h b/include/ttmlir/Dialect/TTIR/Analysis/LegalGridAnalysis.h
new file mode 100644
index 0000000000..b2ffede5ef
--- /dev/null
+++ b/include/ttmlir/Dialect/TTIR/Analysis/LegalGridAnalysis.h
@@ -0,0 +1,53 @@
+// SPDX-FileCopyrightText: (c) 2024 Tenstorrent AI ULC
+//
+// SPDX-License-Identifier: Apache-2.0
+
+#ifndef TTMLIR_DIALECT_TTIR_ANALYSIS_LEGALGRIDANALYSIS_H
+#define TTMLIR_DIALECT_TTIR_ANALYSIS_LEGALGRIDANALYSIS_H
+
+#include "ttmlir/Dialect/TT/IR/TTOpsTypes.h"
+#include "ttmlir/Dialect/TTIR/Analysis/TTIRAnalysis.h"
+#include "llvm/ADT/StringMap.h"
+
+namespace mlir::tt::ttir {
+
+struct LegalGridAnalysisInput {
+  ChipDescAttr chipDesc;
+  GridAttr maxGrid;
+  RankedTensorType tensorType;
+  llvm::StringMap<SmallVector<int64_t, 2>> *gridSizeOverrides;
+
+  LegalGridAnalysisInput()
+      : chipDesc(nullptr), maxGrid(nullptr), tensorType(nullptr),
+        gridSizeOverrides(nullptr) {}
+
+  LegalGridAnalysisInput(
+      ChipDescAttr chipDesc, GridAttr maxGrid, RankedTensorType tensorType,
+      llvm::StringMap<SmallVector<int64_t, 2>> *gridSizeOverrides)
+      : chipDesc(chipDesc), maxGrid(maxGrid), tensorType(tensorType),
+        gridSizeOverrides(gridSizeOverrides) {}
+
+  bool operator==(const LegalGridAnalysisInput &rhs) const {
+    return chipDesc == rhs.chipDesc && maxGrid == rhs.maxGrid &&
+           tensorType == rhs.tensorType &&
+           gridSizeOverrides == rhs.gridSizeOverrides;
+  }
+
+  bool operator!=(const LegalGridAnalysisInput &rhs) const {
+    return !(*this == rhs);
+  }
+};
+
+class LegalGridAnalysis
+    : public TTIRAnalysis<LegalGridAnalysisInput, std::vector<GridAttr>> {
+private:
+  void analysisImplementation() override;
+  bool applyOverrides() override;
+
+public:
+  LegalGridAnalysis(Operation *op) : TTIRAnalysis(op) {}
+};
+
+} // namespace mlir::tt::ttir
+
+#endif // TTMLIR_DIALECT_TTIR_ANALYSIS_LEGALGRIDANALYSIS_H
diff --git a/include/ttmlir/Dialect/TTIR/Analysis/OptimalTargetGridAnalysis.h b/include/ttmlir/Dialect/TTIR/Analysis/OptimalTargetGridAnalysis.h
new file mode 100644
index 0000000000..cfba2ca039
--- /dev/null
+++ b/include/ttmlir/Dialect/TTIR/Analysis/OptimalTargetGridAnalysis.h
@@ -0,0 +1,46 @@
+// SPDX-FileCopyrightText: (c) 2024 Tenstorrent AI ULC
+//
+// SPDX-License-Identifier: Apache-2.0
+
+#ifndef TTMLIR_DIALECT_TTIR_ANALYSIS_OPTIMALTARGETGRIDANALYSIS_H
+#define TTMLIR_DIALECT_TTIR_ANALYSIS_OPTIMALTARGETGRIDANALYSIS_H
+
+#include "ttmlir/Dialect/TT/IR/TTOpsTypes.h"
+#include "ttmlir/Dialect/TTIR/Analysis/TTIRAnalysis.h"
+
+namespace mlir::tt::ttir {
+
+struct OptimalTargetGridAnalysisInput {
+  llvm::DenseMap<Operation *, std::vector<GridAttr>> legalGrids;
+
+  OptimalTargetGridAnalysisInput() : legalGrids() {}
+
+  OptimalTargetGridAnalysisInput(
+      const llvm::DenseMap<Operation *, std::vector<GridAttr>> &&legalGrids)
+      : legalGrids(std::move(legalGrids)) {}
+
+  bool operator==(const OptimalTargetGridAnalysisInput &rhs) const {
+    return legalGrids == rhs.legalGrids;
+  }
+
+  bool operator!=(const OptimalTargetGridAnalysisInput &rhs) const {
+    return !(*this == rhs);
+  }
+};
+
+// Determine optimal target grid size for each op.
+//
+class OptimalTargetGridAnalysis
+    : public TTIRAnalysis<OptimalTargetGridAnalysisInput,
+                          llvm::DenseMap<Operation *, GridAttr>> {
+
+private:
+  void analysisImplementation() override;
+  bool applyOverrides() override;
+
+public:
+  OptimalTargetGridAnalysis(Operation *op) : TTIRAnalysis(op) {}
+};
+} // namespace mlir::tt::ttir
+
+#endif // TTMLIR_DIALECT_TTIR_ANALYSIS_OPTIMALTARGETGRIDANALYSIS_H
diff --git a/include/ttmlir/Dialect/TTIR/Analysis/TTIRAnalysis.h b/include/ttmlir/Dialect/TTIR/Analysis/TTIRAnalysis.h
index 95180f935b..1c0bb13f47 100644
--- a/include/ttmlir/Dialect/TTIR/Analysis/TTIRAnalysis.h
+++ b/include/ttmlir/Dialect/TTIR/Analysis/TTIRAnalysis.h
@@ -13,9 +13,9 @@ namespace mlir::tt::ttir {
 template <typename I, typename R> class TTIRAnalysis {
 protected:
   Operation *op;
-  bool is_valid = false;
-  R analysis_result;
-  I analysis_input;
+  bool isValid = false;
+  R analysisResult;
+  I analysisInput;

   TTIRAnalysis(Operation *op) : op(op) {}

@@ -38,9 +38,9 @@ template <typename I, typename R> class TTIRAnalysis {
   void init(const I &input) {
     // Analysis can be cached and reused. Check that input remained the same.
     //
-    if (analysis_input != input) {
-      analysis_input = input;
-      is_valid = false;
+    if (analysisInput != input) {
+      analysisInput = input;
+      isValid = false;
     }
   }

@@ -48,7 +48,7 @@ template <typename I, typename R> class TTIRAnalysis {
   //
   const R &getResult() {
     runAnalysis();
-    return analysis_result;
+    return analysisResult;
   }

private:
@@ -57,16 +57,16 @@ template <typename I, typename R> class TTIRAnalysis {
   void runAnalysis() {
     // Skip the analysis if it was already run and input params haven't changed.
     //
-    if (!is_valid) {
+    if (!isValid) {
       // Apply overrides if needed.
       //
-      bool skip_analysis = applyOverrides();
+      bool skipAnalysis = applyOverrides();

-      if (!skip_analysis) {
+      if (!skipAnalysis) {
         analysisImplementation();
       }

-      is_valid = true;
+      isValid = true;
     }
   }
 };
diff --git a/include/ttmlir/Dialect/TTIR/IR/TTIROps.td b/include/ttmlir/Dialect/TTIR/IR/TTIROps.td
index 69e329bc07..99cae3ed29 100644
--- a/include/ttmlir/Dialect/TTIR/IR/TTIROps.td
+++ b/include/ttmlir/Dialect/TTIR/IR/TTIROps.td
@@ -60,10 +60,10 @@ def TTIR_GenericOp : TTIR_DPSOp<"generic", [AttrSizedOperandSegments]> {
   let regions = (region AnyRegion:$region);
 }

-def TTIR_LayoutOp : TTIR_Op<"layout", [DestinationStyleOpInterface]> {
-  let summary = "Layout op.";
+def TTIR_ToLayoutOp : TTIR_Op<"to_layout", [DestinationStyleOpInterface]> {
+  let summary = "ToLayout op.";
   let description = [{
-    Layout operation, transition tensors from one layout to another. Some examples include:
+    ToLayout operation, transition tensors from one layout to another. Some examples include:
      - Transitioning between different memory spaces, e.g. DRAM to L1.
      - Transitioning between different data types, e.g. f32 to f16.
      - Transitioning between different tile sizes, e.g. 1x16 to 32x32
@@ -73,7 +73,7 @@ def TTIR_LayoutOp : TTIR_Op<"layout", [DestinationStyleOpInterface]> {
     ```llvm
     #layout = #tt.layout<8192x128x1, undef, <1x1>, memref<64x128xf32, #system>>
     #layout1 = #tt.layout<8192x128x1, undef, <1x1>, memref<64x128xf32, #l1_>>
-    %1 = "ttir.layout"(%arg0, %0) : (tensor<64x128xf32, #layout>, tensor<64x128xf32, #layout1>) -> tensor<64x128xf32, #layout1>
+    %1 = "ttir.to_layout"(%arg0, %0) : (tensor<64x128xf32, #layout>, tensor<64x128xf32, #layout1>) -> tensor<64x128xf32, #layout1>
     ```
   }];

@@ -217,6 +217,13 @@ def TTIR_SumOp : TTIR_ReductionOp<"sum"> {
   }];
 }

+def TTIR_MeanOp : TTIR_ReductionOp<"mean"> {
+  let summary = "Mean reduction op.";
+  let description = [{
+    Mean reduction op.
+  }];
+}
+
 def TTIR_SoftmaxOp : TTIR_DPSOp<"softmax"> {
   let summary = "Softmax operation.";
   let description = [{
@@ -225,7 +232,7 @@ def TTIR_SoftmaxOp : TTIR_DPSOp<"softmax"> {

   let arguments = (ins AnyRankedTensor:$input,
                        AnyRankedTensor:$output,
-                       I32Attr:$dimension,
+                       SI32Attr:$dimension,
                        TT_OperandConstraintArrayAttr:$operand_constraints);

   let results = (outs AnyRankedTensor:$result);
diff --git a/include/ttmlir/Dialect/TTIR/IR/TTIROpsInterfaces.h b/include/ttmlir/Dialect/TTIR/IR/TTIROpsInterfaces.h
index aad064211f..1d88e8a657 100644
--- a/include/ttmlir/Dialect/TTIR/IR/TTIROpsInterfaces.h
+++ b/include/ttmlir/Dialect/TTIR/IR/TTIROpsInterfaces.h
@@ -5,6 +5,7 @@
 #ifndef TTMLIR_DIALECT_TTIR_IR_TTIROPSINTERFACES_H
 #define TTMLIR_DIALECT_TTIR_IR_TTIROPSINTERFACES_H

+#include "ttmlir/Dialect/TT/IR/TTOpsTypes.h"
 #include "ttmlir/Dialect/TTIR/IR/TTIR.h"

 namespace mlir {
diff --git a/include/ttmlir/Dialect/TTIR/IR/TTIROpsInterfaces.td b/include/ttmlir/Dialect/TTIR/IR/TTIROpsInterfaces.td
index 6b6d2dda1c..0c24b685ad 100644
--- a/include/ttmlir/Dialect/TTIR/IR/TTIROpsInterfaces.td
+++ b/include/ttmlir/Dialect/TTIR/IR/TTIROpsInterfaces.td
@@ -8,7 +8,7 @@
 include "mlir/IR/OpBase.td"
 include "ttmlir/Dialect/TT/IR/TTOpsTypes.td"

-def TTIROpInterface : OpInterface<"TTIROpInterface"> {
+def TTIROpInterface : OpInterface<"TTIROp"> {
   let cppNamespace = "::mlir::tt::ttir";
   let methods = [
     InterfaceMethod<
@@ -21,6 +21,16 @@ def TTIROpInterface : OpInterface<"TTIROp"> {
       /*methodBody=*/"",
      /*defaultImplementation=*/""
     >,
+    InterfaceMethod<
+      /*desc=*/[{
+        Get the device of the current scope.
+      }],
+      /*retTy=*/"::mlir::tt::DeviceAttr",
+      /*methodName=*/"getDevice",
+      /*args=*/(ins),
+      /*methodBody=*/"",
+      /*defaultImplementation=*/"return ::mlir::tt::getCurrentScopeDevice($_op);"
+    >,
   ];
 }
diff --git a/include/ttmlir/Dialect/TTIR/Transforms/Passes.td b/include/ttmlir/Dialect/TTIR/Transforms/Passes.td
index 987ce57c16..8c1391af64 100644
--- a/include/ttmlir/Dialect/TTIR/Transforms/Passes.td
+++ b/include/ttmlir/Dialect/TTIR/Transforms/Passes.td
@@ -7,10 +7,11 @@

 include "mlir/Pass/PassBase.td"

-def ConvertTosaToTTIR: Pass<"convert-tosa-to-ttir", "::mlir::ModuleOp"> {
-  let summary = "";
+def TTIRImplicitDevice: Pass<"ttir-implicit-device", "::mlir::ModuleOp"> {
+  let summary = "Create an implicit device";
   let description = [{
-    Convert TOSA ops to TTIR ops.
+    This pass will take a view of the system descriptor and create an implicit
+    device around it.
   }];
 }

@@ -50,10 +51,21 @@ def TTIRGridSet: Pass<"ttir-grid-set", "::mlir::ModuleOp"> {
   }];
   let options = [
     Option<"overrideGridSizes", "override-grid-sizes",
-           "llvm::StringMap>",
-           /*default=*/"llvm::StringMap>()",
+           "llvm::StringMap>",
+           /*default=*/"llvm::StringMap>()",
            "Override grid sizes for specific ops.">,
   ];
 }

+def TTIRLoadSystemDesc: Pass<"ttir-load-system-desc", "::mlir::ModuleOp"> {
+  let summary = "Load system desc.";
+  let description = [{
+    Load system descriptor as a compiler pass.
+  }];
+
+  list