From 6a219f20471133604c523a09ea1f75152d4e3c66 Mon Sep 17 00:00:00 2001 From: Stefan Djordjevic Date: Mon, 27 Jan 2025 17:14:04 +0000 Subject: [PATCH] Adding ttnn_to_dtype op in TTNN dialect --- include/ttmlir/Dialect/TTNN/IR/TTNNOps.td | 15 ++ include/ttmlir/Target/TTNN/program.fbs | 7 + lib/Conversion/TTNNToEmitC/TTNNToEmitC.cpp | 28 ++ .../TTNN/Transforms/TTNNDecomposeLayouts.cpp | 219 +++++++++++----- lib/Target/TTNN/TTNNToFlatbuffer.cpp | 16 ++ .../include/tt/runtime/detail/workarounds.h | 14 +- runtime/lib/common/workarounds.cpp | 4 +- runtime/lib/ttnn/operations/CMakeLists.txt | 1 + .../lib/ttnn/operations/layout/to_dtype.cpp | 21 ++ runtime/lib/ttnn/operations/layout/to_dtype.h | 15 ++ .../lib/ttnn/operations/layout/typecast.cpp | 8 +- runtime/lib/ttnn/program.cpp | 4 + test/lit.cfg.py | 6 + test/lit.site.cfg.py.in | 1 + .../decomposing_layouts_from_host.mlir | 237 +++++++++++++++++ test/ttmlir/Dialect/TTNN/simple_clamp.mlir | 3 +- test/ttmlir/EmitC/TTNN/other/embedding.mlir | 3 - .../create_system_desc_device.mlir | 4 + .../decomposing_layouts_from_host.mlir | 240 ++++++++++++++++++ .../TTNN/perf_unit/test_perf_clamp.mlir | 3 +- test/ttmlir/Silicon/TTNN/simple_eltwise.mlir | 3 +- ...ract-and-replace-system-desc-and-device.py | 46 ++++ 22 files changed, 802 insertions(+), 96 deletions(-) create mode 100644 runtime/lib/ttnn/operations/layout/to_dtype.cpp create mode 100644 runtime/lib/ttnn/operations/layout/to_dtype.h create mode 100644 test/ttmlir/Dialect/TTNN/Transforms/DecomposeLayouts/decomposing_layouts_from_host.mlir create mode 100644 test/ttmlir/Silicon/TTNN/Transforms/DecomposeLayouts/create_system_desc_device.mlir create mode 100644 test/ttmlir/Silicon/TTNN/Transforms/DecomposeLayouts/decomposing_layouts_from_host.mlir create mode 100644 tools/scripts/extract-and-replace-system-desc-and-device.py diff --git a/include/ttmlir/Dialect/TTNN/IR/TTNNOps.td b/include/ttmlir/Dialect/TTNN/IR/TTNNOps.td index 8a2e252c1f..b60e897c98 100644 --- a/include/ttmlir/Dialect/TTNN/IR/TTNNOps.td +++ b/include/ttmlir/Dialect/TTNN/IR/TTNNOps.td @@ -78,6 +78,21 @@ def TTNN_TypecastOp : TTNN_Op<"typecast"> { let results = (outs AnyRankedTensor:$result); } +def TTNN_ToDTypeOp : TTNN_Op<"to_dtype"> { + let summary = "ToDType op."; + let description = [{ + This op converts the data type of the input tensor based on the given data type on the host. + + Args: + - :attr:`input`: the ttnn.Tensor + - :attr:`dtype`: `ttnn` data type. 
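+
+    Example (illustrative sketch; `%input` and `%out` are placeholder names and
+    the exact attribute syntax is assumed rather than taken verbatim from this
+    patch — it mirrors the `#tt.supportedDataTypes` form used in the lit tests
+    added below):
+
+      // Cast a host-resident f32 tensor to bf16 without changing its layout.
+      %out = "ttnn.to_dtype"(%input) <{dtype = #tt.supportedDataTypes<bf16>}> : (tensor<64x128xf32>) -> tensor<64x128xbf16>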
+ }]; + + let arguments = (ins AnyRankedTensor:$input, + TT_DataTypeAttr:$dtype); + let results = (outs AnyRankedTensor:$result); +} + def TTNN_ToDeviceOp : TTNN_Op<"to_device"> { let summary = "ToDevice op."; let description = [{ diff --git a/include/ttmlir/Target/TTNN/program.fbs b/include/ttmlir/Target/TTNN/program.fbs index ecb5c6de54..c11dfa563c 100644 --- a/include/ttmlir/Target/TTNN/program.fbs +++ b/include/ttmlir/Target/TTNN/program.fbs @@ -24,6 +24,12 @@ table ToLayoutOp { out: tt.target.TensorRef; } +table ToDTypeOp { + in: tt.target.TensorRef; + dtype: tt.target.DataType; + out: tt.target.TensorRef; +} + table TypecastOp { in: tt.target.TensorRef; dtype: tt.target.DataType; @@ -396,6 +402,7 @@ union OpType { GetDeviceOp, ToMemoryConfigOp, ToLayoutOp, + ToDTypeOp, TypecastOp, ToDeviceOp, FromDeviceOp, diff --git a/lib/Conversion/TTNNToEmitC/TTNNToEmitC.cpp b/lib/Conversion/TTNNToEmitC/TTNNToEmitC.cpp index a22d1e4c6b..b9417d4d82 100644 --- a/lib/Conversion/TTNNToEmitC/TTNNToEmitC.cpp +++ b/lib/Conversion/TTNNToEmitC/TTNNToEmitC.cpp @@ -735,6 +735,33 @@ class TypecastOpConversionPattern }; } // namespace +// ToDTypeOp conversion pattern +// +namespace { +class ToDTypeOpConversionPattern + : public TTNNToEmitCBaseOpConversionPattern { + +public: + using TTNNToEmitCBaseOpConversionPattern< + ttnn::ToDTypeOp>::TTNNToEmitCBaseOpConversionPattern; + + LogicalResult + matchAndRewrite(ttnn::ToDTypeOp srcOp, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const override { + + ArrayAttr arrayAttrs = rewriter.getArrayAttr( + {mlir::IntegerAttr::get(rewriter.getIndexType(), 0), + ttnn_to_emitc::utils::convertDType(rewriter, srcOp.getDtypeAttr())}); + + rewriter.replaceOpWithNewOp( + srcOp, this->getTypeConverter()->convertType(srcOp.getType()), + this->convertOpName(srcOp), arrayAttrs, nullptr, adaptor.getOperands()); + + return success(); + } +}; +} // namespace + // ToMemoryConfig conversion pattern // namespace { @@ -1128,6 +1155,7 @@ void populateTTNNToEmitCPatterns(mlir::MLIRContext *ctx, // clang-format off patterns.add, TypecastOpConversionPattern, ToDeviceOpConversionPattern, FromDeviceOpConversionPattern, diff --git a/lib/Dialect/TTNN/Transforms/TTNNDecomposeLayouts.cpp b/lib/Dialect/TTNN/Transforms/TTNNDecomposeLayouts.cpp index 2bf4d90085..b988c96b3a 100644 --- a/lib/Dialect/TTNN/Transforms/TTNNDecomposeLayouts.cpp +++ b/lib/Dialect/TTNN/Transforms/TTNNDecomposeLayouts.cpp @@ -2,8 +2,11 @@ // // SPDX-License-Identifier: Apache-2.0 +#include "ttmlir/Dialect/TTNN/IR/TTNNOps.h" +#include "ttmlir/Dialect/TTNN/IR/TTNNOpsAttrs.h" #include "ttmlir/Dialect/TTNN/Transforms/Passes.h" #include "ttmlir/Dialect/TTNN/Utils/Utils.h" +#include namespace mlir::tt::ttnn { #define GEN_PASS_DEF_TTNNDECOMPOSELAYOUTS @@ -63,12 +66,12 @@ class TTNNDecomposeLayouts bool createToDeviceOp = false; bool createFromDeviceOp = false; bool createToLayoutOp = false; - bool createTypecastOp = false; + bool createDataTypeCastOp = false; bool createToMemoryConfigOp = false; bool createSomeOp() const { - return createToLayoutOp or createTypecastOp or createToDeviceOp or - createFromDeviceOp or createToMemoryConfigOp; + return createToLayoutOp || createDataTypeCastOp || createToDeviceOp || + createFromDeviceOp || createToMemoryConfigOp; } void print() const { @@ -80,7 +83,7 @@ class TTNNDecomposeLayouts << "\t" << "CreateToLayoutOp: " << createToLayoutOp << "\n" << "\t" - << "CreateTypecastOp: " << createTypecastOp << "\n" + << "CreateTypecastOp: " << createDataTypeCastOp << "\n" << "\t" << 
"CreateToMemoryConfigOp: " << createToMemoryConfigOp << "\n" @@ -146,7 +149,7 @@ class TTNNDecomposeLayouts opsToCreate.createFromDeviceOp = (input.bufferType != output.bufferType) and output.isOnHost(); - opsToCreate.createTypecastOp = input.dataType != output.dataType; + opsToCreate.createDataTypeCastOp = input.dataType != output.dataType; opsToCreate.createToLayoutOp = input.layoutEnum != output.layoutEnum; // TODO(bug #665): // Insert a ToLayoutOp manually if we're moving from device to host to @@ -286,30 +289,58 @@ class TTNNDecomposeLayouts RankedTensorType newResultType = utils::createRankedTensorTypeWithElementType(inputType, memrefElementType); + + TTNNLayoutAttr inputLayout = + mlir::cast(inputType.getEncoding()); + return this->createOp( rewriter, op, newResultType, currentInput, layoutAttr, /*dtype*/ nullptr, - /*memory_config*/ nullptr, /*device*/ nullptr); + /*memory_config*/ nullptr, + inputLayout.isSystemBufferType() ? nullptr : info.device); } - mlir::Value createTypecastOpIfNeeded(ttnn::ToLayoutOp op, - IRRewriter &rewriter, - mlir::Value currentInput, - const OpCreationInfo &info) const { - if (not info.opsToCreate.createTypecastOp) { - return currentInput; - } + template + mlir::Value createDataTypeCastingOp(ttnn::ToLayoutOp op, IRRewriter &rewriter, + mlir::Value currentInput, + const OpCreationInfo &info) const { DataTypeAttr dtypeAttr = DataTypeAttr::get(op.getContext(), info.output.dataType); RankedTensorType currentInputType = mlir::cast(currentInput.getType()); + TTNNLayoutAttr currentInputLayout = + mlir::cast(currentInputType.getEncoding()); Type nmemrefElementType = utils::getElementType( - op.getContext(), info.input.layoutEnum, info.output.dataType); + op.getContext(), currentInputLayout.getLayout(), info.output.dataType); RankedTensorType newResultType = utils::createRankedTensorTypeWithElementType(currentInputType, nmemrefElementType); - return this->createOp(rewriter, op, newResultType, - currentInput, dtypeAttr); + return this->createOp(rewriter, op, newResultType, currentInput, + dtypeAttr); + } + + mlir::Value + createDataTypeCastingOpIfNeeded(ttnn::ToLayoutOp op, IRRewriter &rewriter, + mlir::Value currentInput, + const OpCreationInfo &info) const { + if (!info.opsToCreate.createDataTypeCastOp) { + return currentInput; + } + + RankedTensorType currentInputType = + mlir::cast(currentInput.getType()); + + TTNNLayoutAttr inputLayout = + mlir::cast(currentInputType.getEncoding()); + if (inputLayout.isSystemBufferType()) { + // If the input tensor is on host, we need to cast it on the host + return this->createDataTypeCastingOp(op, rewriter, + currentInput, info); + } + + // If the input tensor is on device, we can cast it on the device. + return this->createDataTypeCastingOp(op, rewriter, + currentInput, info); } mlir::Value createToMemoryConfigOpIfNeeded(ttnn::ToLayoutOp op, @@ -350,7 +381,19 @@ class TTNNDecomposeLayouts const LayoutInfo &output = info.output; assert(input.dataType == output.dataType && "Data type should be the same if we're not creating typecast op"); - /* if we should untilize, untilize on host */ + + // If the output is on the host, we can perform layout conversion on host. 
+ if (output.isOnHost()) { + currentInput = + this->createToLayoutOpIfNeeded(op, rewriter, currentInput, info); + currentInput = this->createToMemoryConfigOpIfNeeded(op, rewriter, + currentInput, info); + op.getResult().replaceAllUsesWith(currentInput); + return; + } + + // If the output is on device and we should untilize, we can untilize on + // host and than move the tensor to device. if (info.shouldUntilize()) { currentInput = this->createToLayoutOpIfNeeded(op, rewriter, currentInput, info); @@ -362,9 +405,10 @@ class TTNNDecomposeLayouts return; } - /* If we should tilize, and the data type is bfloat16, we can tilize on - * device */ - if (info.shouldTilize() and output.dataType == DataType::BFloat16) { + // Tilizing on device is supported only for bf16 data format. If the tensor + // is bf16 and the output is on device, we can move the tensor to device and + // perform the tilization on device. + if (info.shouldTilize() && output.dataType == DataType::BFloat16) { currentInput = this->createToDeviceOpIfNeeded(op, rewriter, currentInput, info); currentInput = @@ -375,9 +419,9 @@ class TTNNDecomposeLayouts return; } - /* If we should tilize, and the data type is not bfloat16, we tilize on host - */ - if (info.shouldTilize() and output.dataType != DataType::BFloat16) { + // Otherwise, if tensor is not in bf16 data format, we perform tilizing on + // host and than move the tensor to device. + if (info.shouldTilize() && output.dataType != DataType::BFloat16) { currentInput = this->createToLayoutOpIfNeeded(op, rewriter, currentInput, info); currentInput = @@ -400,24 +444,40 @@ class TTNNDecomposeLayouts assert(input.layoutEnum == output.layoutEnum && "Layout should be the same if we're not creating a ToLayoutOp"); - /* If the output is already tilized, we can typecast on device */ - if (output.isTilized()) { + // If the output is on the host, we can perform the data type cast directly + // on the host. + if (output.isOnHost()) { + currentInput = this->createDataTypeCastingOpIfNeeded(op, rewriter, + currentInput, info); + currentInput = this->createToMemoryConfigOpIfNeeded(op, rewriter, + currentInput, info); + op.getResult().replaceAllUsesWith(currentInput); + return; + } + + // Device typecast only supports tilized tensors. Therefore, if the output + // tensor is in row-major (input as well is in row-major) and resides on the + // device, we should perform the data type casting on the host before moving + // the tensor back to the device. + if (!output.isTilized()) { + currentInput = this->createDataTypeCastingOpIfNeeded(op, rewriter, + currentInput, info); currentInput = this->createToDeviceOpIfNeeded(op, rewriter, currentInput, info); - currentInput = - this->createTypecastOpIfNeeded(op, rewriter, currentInput, info); currentInput = this->createToMemoryConfigOpIfNeeded(op, rewriter, currentInput, info); op.getResult().replaceAllUsesWith(currentInput); return; } - /* If the output is not tilized, typecast on host */ - if (not output.isTilized()) { - currentInput = - this->createTypecastOpIfNeeded(op, rewriter, currentInput, info); + // If the output tensor is tilized and resides on the device, we can move + // the tensor to the device and perform the data type cast directly on the + // device. 
+ if (output.isTilized()) { currentInput = this->createToDeviceOpIfNeeded(op, rewriter, currentInput, info); + currentInput = this->createDataTypeCastingOpIfNeeded(op, rewriter, + currentInput, info); currentInput = this->createToMemoryConfigOpIfNeeded(op, rewriter, currentInput, info); op.getResult().replaceAllUsesWith(currentInput); @@ -433,11 +493,26 @@ class TTNNDecomposeLayouts const LayoutInfo &input = info.input; const LayoutInfo &output = info.output; - /* If we need to untilize and typecast, then untilize and typecast on host - */ - if (info.shouldUntilize()) { + // If the output tensor is on host, we can perform the data type cast and + // layout conversion on host. + if (output.isOnHost()) { + currentInput = this->createDataTypeCastingOpIfNeeded(op, rewriter, + currentInput, info); currentInput = - this->createTypecastOpIfNeeded(op, rewriter, currentInput, info); + this->createToLayoutOpIfNeeded(op, rewriter, currentInput, info); + currentInput = this->createToMemoryConfigOpIfNeeded(op, rewriter, + currentInput, info); + op.getResult().replaceAllUsesWith(currentInput); + return; + } + + // Untilize is only supported on the host, and typecast is only supported on + // the device for tilized tensors. Therefore, we need to untilize and change + // the tensor data type format on the host before moving the tensor to the + // device. + if (info.shouldUntilize()) { + currentInput = this->createDataTypeCastingOpIfNeeded(op, rewriter, + currentInput, info); currentInput = this->createToLayoutOpIfNeeded(op, rewriter, currentInput, info); currentInput = @@ -448,26 +523,29 @@ class TTNNDecomposeLayouts return; } - /* If we need to tilize and the input datatype is bfloat16 - we can tilize on device and then typecast afterwards */ - if (info.shouldTilize() and input.dataType == DataType::BFloat16) { + // If we need to tilize and change the data type from bf16 to another + // format, we can move the tensor to the device, perform the tilization, and + // then cast the data type on the device since tilization is supported for + // bf16 on the device. + if (info.shouldTilize() && input.dataType == DataType::BFloat16) { currentInput = this->createToDeviceOpIfNeeded(op, rewriter, currentInput, info); currentInput = this->createToLayoutOpIfNeeded(op, rewriter, currentInput, info); - currentInput = - this->createTypecastOpIfNeeded(op, rewriter, currentInput, info); + currentInput = this->createDataTypeCastingOpIfNeeded(op, rewriter, + currentInput, info); currentInput = this->createToMemoryConfigOpIfNeeded(op, rewriter, currentInput, info); op.getResult().replaceAllUsesWith(currentInput); return; } - /* if we need to tilize and the output data type is bfloat16 - we can typecast on host and tilize on device */ + // If we need to tilize and change the data type format from another format + // to bf16, we can cast the data type on the host, move the tensor to the + // device, and then perform the tilization. 
if (info.shouldTilize() and output.dataType == DataType::BFloat16) { - currentInput = - this->createTypecastOpIfNeeded(op, rewriter, currentInput, info); + currentInput = this->createDataTypeCastingOpIfNeeded(op, rewriter, + currentInput, info); currentInput = this->createToDeviceOpIfNeeded(op, rewriter, currentInput, info); currentInput = @@ -482,8 +560,8 @@ class TTNNDecomposeLayouts * everything on host */ if (info.shouldTilize() and input.dataType != DataType::BFloat16 and output.dataType != DataType::BFloat16) { - currentInput = - this->createTypecastOpIfNeeded(op, rewriter, currentInput, info); + currentInput = this->createDataTypeCastingOpIfNeeded(op, rewriter, + currentInput, info); currentInput = this->createToLayoutOpIfNeeded(op, rewriter, currentInput, info); currentInput = @@ -502,17 +580,17 @@ class TTNNDecomposeLayouts mlir::Value currentInput, const OpCreationInfo &info) const { const OpsToCreate &opsToCreate = info.opsToCreate; - if (not opsToCreate.createToLayoutOp and not opsToCreate.createTypecastOp) { + if (!opsToCreate.createToLayoutOp && !opsToCreate.createDataTypeCastOp) { return handleHostInputNoLayoutNoTypecast(op, rewriter, currentInput, info); } - if (opsToCreate.createToLayoutOp and not opsToCreate.createTypecastOp) { + if (opsToCreate.createToLayoutOp && !opsToCreate.createDataTypeCastOp) { return handleHostInputLayoutNoTypecast(op, rewriter, currentInput, info); } - if (not opsToCreate.createToLayoutOp and opsToCreate.createTypecastOp) { + if (!opsToCreate.createToLayoutOp && opsToCreate.createDataTypeCastOp) { return handleHostInputNoLayoutTypecast(op, rewriter, currentInput, info); } - if (opsToCreate.createToLayoutOp and opsToCreate.createTypecastOp) { + if (opsToCreate.createToLayoutOp && opsToCreate.createDataTypeCastOp) { return handleHostInputLayoutTypecast(op, rewriter, currentInput, info); } llvm_unreachable("Unreachable code path"); @@ -630,8 +708,8 @@ class TTNNDecomposeLayouts /* If the output is tilized, typecast directly on device*/ if (output.isTilized()) { - currentInput = - this->createTypecastOpIfNeeded(op, rewriter, currentInput, info); + currentInput = this->createDataTypeCastingOpIfNeeded(op, rewriter, + currentInput, info); currentInput = this->createToMemoryConfigOpIfNeeded(op, rewriter, currentInput, info); currentInput = @@ -644,8 +722,8 @@ class TTNNDecomposeLayouts if (not output.isTilized() and opsToCreate.createFromDeviceOp) { currentInput = this->createFromDeviceOpIfNeeded(op, rewriter, currentInput, info); - currentInput = - this->createTypecastOpIfNeeded(op, rewriter, currentInput, info); + currentInput = this->createDataTypeCastingOpIfNeeded(op, rewriter, + currentInput, info); op.getResult().replaceAllUsesWith(currentInput); return; } @@ -656,8 +734,8 @@ class TTNNDecomposeLayouts currentInput = this->createOp(op, rewriter, currentInput); // typecast on host - currentInput = - this->createTypecastOpIfNeeded(op, rewriter, currentInput, info); + currentInput = this->createDataTypeCastingOpIfNeeded(op, rewriter, + currentInput, info); // move back to device and convert memory config if needed currentInput = this->createOp( op, rewriter, currentInput, info.device, @@ -680,8 +758,8 @@ class TTNNDecomposeLayouts /* If we need to untilize, typecast on device and untilize on host */ if (info.shouldUntilize() and opsToCreate.createFromDeviceOp) { - currentInput = - this->createTypecastOpIfNeeded(op, rewriter, currentInput, info); + currentInput = this->createDataTypeCastingOpIfNeeded(op, rewriter, + currentInput, info); currentInput = 
this->createFromDeviceOpIfNeeded(op, rewriter, currentInput, info); currentInput = @@ -694,8 +772,8 @@ class TTNNDecomposeLayouts * host, move back to device */ if (info.shouldUntilize() and not opsToCreate.createFromDeviceOp) { // typecast on device - currentInput = - this->createTypecastOpIfNeeded(op, rewriter, currentInput, info); + currentInput = this->createDataTypeCastingOpIfNeeded(op, rewriter, + currentInput, info); // Force-create a FromDeviceOp currentInput = this->createFromDeviceOpIfNeeded( op, rewriter, currentInput, info, true /* forceCreate */); @@ -714,8 +792,8 @@ class TTNNDecomposeLayouts if (info.shouldTilize() and input.dataType == DataType::BFloat16) { currentInput = this->createToLayoutOpIfNeeded(op, rewriter, currentInput, info); - currentInput = - this->createTypecastOpIfNeeded(op, rewriter, currentInput, info); + currentInput = this->createDataTypeCastingOpIfNeeded(op, rewriter, + currentInput, info); currentInput = this->createToMemoryConfigOpIfNeeded(op, rewriter, currentInput, info); currentInput = @@ -732,8 +810,8 @@ class TTNNDecomposeLayouts this->createFromDeviceOpIfNeeded(op, rewriter, currentInput, info); currentInput = this->createToLayoutOpIfNeeded(op, rewriter, currentInput, info); - currentInput = - this->createTypecastOpIfNeeded(op, rewriter, currentInput, info); + currentInput = this->createDataTypeCastingOpIfNeeded(op, rewriter, + currentInput, info); op.getResult().replaceAllUsesWith(currentInput); return; } @@ -753,8 +831,8 @@ class TTNNDecomposeLayouts currentInput = this->createOp( op, rewriter, currentInput, info.device, /* optional MemConfigAttr */ nullptr); - currentInput = - this->createTypecastOpIfNeeded(op, rewriter, currentInput, info); + currentInput = this->createDataTypeCastingOpIfNeeded(op, rewriter, + currentInput, info); currentInput = this->createToMemoryConfigOpIfNeeded(op, rewriter, currentInput, info); op.getResult().replaceAllUsesWith(currentInput); @@ -769,19 +847,20 @@ class TTNNDecomposeLayouts mlir::Value currentInput, const OpCreationInfo &info) const { const OpsToCreate &opsToCreate = info.opsToCreate; - if (not opsToCreate.createToLayoutOp and not opsToCreate.createTypecastOp) { + if (not opsToCreate.createToLayoutOp and + not opsToCreate.createDataTypeCastOp) { handleDeviceInputNoLayoutNoTypecast(op, rewriter, currentInput, info); return; } - if (opsToCreate.createToLayoutOp and not opsToCreate.createTypecastOp) { + if (opsToCreate.createToLayoutOp and not opsToCreate.createDataTypeCastOp) { handleDeviceInputLayoutNoTypecast(op, rewriter, currentInput, info); return; } - if (not opsToCreate.createToLayoutOp and opsToCreate.createTypecastOp) { + if (not opsToCreate.createToLayoutOp and opsToCreate.createDataTypeCastOp) { handleDeviceInputNoLayoutTypecast(op, rewriter, currentInput, info); return; } - if (opsToCreate.createToLayoutOp and opsToCreate.createTypecastOp) { + if (opsToCreate.createToLayoutOp and opsToCreate.createDataTypeCastOp) { handleDeviceInputLayoutTypecast(op, rewriter, currentInput, info); return; } diff --git a/lib/Target/TTNN/TTNNToFlatbuffer.cpp b/lib/Target/TTNN/TTNNToFlatbuffer.cpp index d45bea5636..4538e6d774 100644 --- a/lib/Target/TTNN/TTNNToFlatbuffer.cpp +++ b/lib/Target/TTNN/TTNNToFlatbuffer.cpp @@ -225,6 +225,18 @@ createOp(FlatbufferObjectCache &cache, ToLayoutOp op) { device ? 
cache.at<::tt::target::DeviceRef>(device) : 0, output); } +::flatbuffers::Offset<::tt::target::ttnn::ToDTypeOp> +createOp(FlatbufferObjectCache &cache, ToDTypeOp op) { + auto input = + cache.at<::tt::target::TensorRef>(getOperandThroughDPSOps(op.getInput())); + ::tt::target::DataType dtype = + ::tt::mlir::ttnn::utils::toTargetDataType(op.getDtype()); + auto output = cache.getOrCreate(op.getResult(), tensorValueToFlatbuffer, + kHostAllocatedAddress, kHostAllocatedSize); + + return ::tt::target::ttnn::CreateToDTypeOp(*cache.fbb, input, dtype, output); +} + ::flatbuffers::Offset<::tt::target::ttnn::TypecastOp> createOp(FlatbufferObjectCache &cache, TypecastOp op) { auto input = @@ -1032,6 +1044,10 @@ emitTTNNOperation(FlatbufferObjectCache &cache, Operation *op, return createOperation(cache, createOp(cache, toLayoutOp), debugString, locInfo); } + if (auto toDTypeOp = dyn_cast(op); toDTypeOp) { + return createOperation(cache, createOp(cache, toDTypeOp), debugString, + locInfo); + } if (auto typecastOp = dyn_cast(op); typecastOp) { return createOperation(cache, createOp(cache, typecastOp), debugString, locInfo); diff --git a/runtime/include/tt/runtime/detail/workarounds.h b/runtime/include/tt/runtime/detail/workarounds.h index 50a12ea108..e33def97ad 100644 --- a/runtime/include/tt/runtime/detail/workarounds.h +++ b/runtime/include/tt/runtime/detail/workarounds.h @@ -17,12 +17,12 @@ struct Env { #endif get(bool maxpool2dPreshard = true, bool swapBinaryOperands = true, bool readUpdateIndexFromDeviceForKVCache = true, - bool toDtypeOnHost = true, bool defaultStrideComputation = true) + bool defaultStrideComputation = true) #if defined(TT_RUNTIME_WORKAROUNDS) && TT_RUNTIME_WORKAROUNDS == 1 ; #else { - return Env(true, true, true, true, true); + return Env(true, true, true, true); } #endif // TODO(bug #855): Ideally we should have an op that preshards for maxpool2d @@ -40,11 +40,6 @@ struct Env { // to be able to pluck this update index from a runtime tensor. bool readUpdateIndexFromDeviceForKVCache; - // TODO(bug #1658): We're currently use ttnn::to_dtype operation to cast the - // data type of a tensor on host. Once we have improved the typecast operation - // to handle this, we should remove this workaround. - bool toDtypeOnHost; - // TODO(bug #2045): Our current stride calculation is incorrect for tilized // tensors. 
The current solution is to remove stride entirely from the // flatbuffer and calculate the stride in runtime assuming using the default @@ -54,13 +49,12 @@ struct Env { private: constexpr Env(bool maxpool2dPreshard, bool swapBinaryOperands, - bool readUpdateIndexFromDeviceForKVCache, bool toDtypeOnHost, + bool readUpdateIndexFromDeviceForKVCache, bool defaultStrideComputation) : maxpool2dPreshard(maxpool2dPreshard), swapBinaryOperands(swapBinaryOperands), readUpdateIndexFromDeviceForKVCache( readUpdateIndexFromDeviceForKVCache), - toDtypeOnHost(toDtypeOnHost), defaultStrideComputation(defaultStrideComputation) {} }; @@ -73,8 +67,6 @@ inline std::ostream &operator<<(std::ostream &os, const Env &env) { os << "\t" << "readUpdateIndexFromDeviceForKVCache: " << env.readUpdateIndexFromDeviceForKVCache << "\n"; - os << "\t" - << "toDtypeOnHost: " << env.toDtypeOnHost << "\n"; os << "\t" << "defaultStrideComputation: " << env.defaultStrideComputation << "\n"; os << "}"; diff --git a/runtime/lib/common/workarounds.cpp b/runtime/lib/common/workarounds.cpp index 5396b8a8c1..bc0481ff62 100644 --- a/runtime/lib/common/workarounds.cpp +++ b/runtime/lib/common/workarounds.cpp @@ -8,9 +8,9 @@ namespace tt::runtime::workaround { #if defined(TT_RUNTIME_WORKAROUNDS) && TT_RUNTIME_WORKAROUNDS == 1 const Env &Env::get(bool maxpool2dPreshard, bool swapBinaryOperands, bool readUpdateIndexFromDeviceForKVCache, - bool toDtypeOnHost, bool defaultStrideComputation) { + bool defaultStrideComputation) { static const Env config(maxpool2dPreshard, swapBinaryOperands, - readUpdateIndexFromDeviceForKVCache, toDtypeOnHost, + readUpdateIndexFromDeviceForKVCache, defaultStrideComputation); return config; } diff --git a/runtime/lib/ttnn/operations/CMakeLists.txt b/runtime/lib/ttnn/operations/CMakeLists.txt index 29d3ab8255..87be4740c4 100644 --- a/runtime/lib/ttnn/operations/CMakeLists.txt +++ b/runtime/lib/ttnn/operations/CMakeLists.txt @@ -32,6 +32,7 @@ set(TTNN_OPS_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/layout/to_device.cpp ${CMAKE_CURRENT_SOURCE_DIR}/layout/from_device.cpp ${CMAKE_CURRENT_SOURCE_DIR}/layout/to_layout.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/layout/to_dtype.cpp ${CMAKE_CURRENT_SOURCE_DIR}/layout/typecast.cpp ${CMAKE_CURRENT_SOURCE_DIR}/layout/to_memory_config.cpp # ANCHOR: adding_an_op_matmul_runtime_cmake diff --git a/runtime/lib/ttnn/operations/layout/to_dtype.cpp b/runtime/lib/ttnn/operations/layout/to_dtype.cpp new file mode 100644 index 0000000000..4b69bd5846 --- /dev/null +++ b/runtime/lib/ttnn/operations/layout/to_dtype.cpp @@ -0,0 +1,21 @@ +// SPDX-FileCopyrightText: (c) 2024 Tenstorrent AI ULC +// +// SPDX-License-Identifier: Apache-2.0 + +#include "operations/layout/to_dtype.h" +#include "tt/runtime/ttnn/utils.h" + +namespace tt::runtime::ttnn::operations::layout { + +void run(const ::tt::target::ttnn::ToDTypeOp *op, ProgramContext &context) { + ProgramTensorPool &tensorPool = context.getTensorPool(); + const ::ttnn::Tensor &inputTensor = tensorPool.at(op->in()->global_id()); + + ::ttnn::DataType targetDataType = + ::tt::runtime::ttnn::utils::toTTNNDataType(op->dtype()); + + ::ttnn::Tensor out = ::ttnn::to_dtype(inputTensor, targetDataType); + + tensorPool.insert_or_assign(op->out()->global_id(), out); +} +} // namespace tt::runtime::ttnn::operations::layout diff --git a/runtime/lib/ttnn/operations/layout/to_dtype.h b/runtime/lib/ttnn/operations/layout/to_dtype.h new file mode 100644 index 0000000000..91a110979a --- /dev/null +++ b/runtime/lib/ttnn/operations/layout/to_dtype.h @@ -0,0 +1,15 @@ +// 
SPDX-FileCopyrightText: (c) 2024 Tenstorrent AI ULC +// +// SPDX-License-Identifier: Apache-2.0 + +#ifndef RUNTIME_LIB_TTNN_OPERATIONS_LAYOUT_TO_DTYPE_H +#define RUNTIME_LIB_TTNN_OPERATIONS_LAYOUT_TO_DTYPE_H + +#include "tt/runtime/ttnn/types.h" +#include "ttmlir/Target/TTNN/program_generated.h" + +namespace tt::runtime::ttnn::operations::layout { +void run(const ::tt::target::ttnn::ToDTypeOp *op, ProgramContext &context); +} // namespace tt::runtime::ttnn::operations::layout + +#endif diff --git a/runtime/lib/ttnn/operations/layout/typecast.cpp b/runtime/lib/ttnn/operations/layout/typecast.cpp index a76f5a2987..f46299fd18 100644 --- a/runtime/lib/ttnn/operations/layout/typecast.cpp +++ b/runtime/lib/ttnn/operations/layout/typecast.cpp @@ -18,13 +18,7 @@ void run(const ::tt::target::ttnn::TypecastOp *op, ProgramContext &context) { ::ttnn::DataType targetDataType = ::tt::runtime::ttnn::utils::toTTNNDataType(op->dtype()); - ::ttnn::Tensor out; - if (workaround::Env::get().toDtypeOnHost && - ::tt::runtime::ttnn::utils::isOnHost(inputTensor.storage_type())) { - out = ::ttnn::to_dtype(inputTensor, targetDataType); - } else { - out = ::ttnn::typecast(inputTensor, targetDataType); - } + ::ttnn::Tensor out = ::ttnn::typecast(inputTensor, targetDataType); tensorPool.insert_or_assign(op->out()->global_id(), out); } diff --git a/runtime/lib/ttnn/program.cpp b/runtime/lib/ttnn/program.cpp index 9de6ddb009..44b1459a7d 100644 --- a/runtime/lib/ttnn/program.cpp +++ b/runtime/lib/ttnn/program.cpp @@ -30,6 +30,7 @@ #include "operations/kv_cache/update_cache.h" #include "operations/layout/from_device.h" #include "operations/layout/to_device.h" +#include "operations/layout/to_dtype.h" #include "operations/layout/to_layout.h" #include "operations/layout/to_memory_config.h" #include "operations/layout/typecast.h" @@ -163,6 +164,9 @@ void ProgramExecutor::runOperation(const ::tt::target::ttnn::Operation *op) { case ::tt::target::ttnn::OpType::ToLayoutOp: { return operations::layout::run(op->type_as_ToLayoutOp(), context); } + case ::tt::target::ttnn::OpType::ToDTypeOp: { + return operations::layout::run(op->type_as_ToDTypeOp(), context); + } case ::tt::target::ttnn::OpType::TypecastOp: { return operations::layout::run(op->type_as_TypecastOp(), context); } diff --git a/test/lit.cfg.py b/test/lit.cfg.py index 886d5e558d..92d5e59e7c 100644 --- a/test/lit.cfg.py +++ b/test/lit.cfg.py @@ -90,6 +90,12 @@ def set_system_desc_features(system_desc): config.substitutions.append(("%ttmlir_libs", config.ttmlir_libs_dir)) +config.test_root = os.path.join(config.ttmlir_source_dir, "test") +config.scripts_root = os.path.join(config.ttmlir_source_dir, "tools/scripts") + +config.substitutions.append(("%ttmlir_test_root", config.test_root)) +config.substitutions.append(("%ttmlir_scripts_root", config.scripts_root)) + # Tweak the PATH to include the tools dir. 
llvm_config.with_environment("PATH", config.llvm_tools_dir, append_path=True) diff --git a/test/lit.site.cfg.py.in b/test/lit.site.cfg.py.in index 7c3b1928a3..bca25b3e01 100644 --- a/test/lit.site.cfg.py.in +++ b/test/lit.site.cfg.py.in @@ -5,6 +5,7 @@ config.lit_tools_dir = "@LLVM_LIT_TOOLS_DIR@" config.mlir_obj_dir = "@MLIR_BINARY_DIR@" config.enable_bindings_python = @MLIR_ENABLE_BINDINGS_PYTHON@ and "@TTMLIR_ENABLE_BINDINGS_PYTHON@" == "ON" config.ttmlir_obj_root = "@TTMLIR_BINARY_DIR@" +config.ttmlir_source_dir = "@TTMLIR_SOURCE_DIR@" config.llvm_shlib_ext = "@SHLIBEXT@" config.enable_stablehlo = "@TTMLIR_ENABLE_STABLEHLO@" and "@TTMLIR_ENABLE_STABLEHLO@" == "ON" config.enable_pykernel = "@TTMLIR_ENABLE_PYKERNEL@" and "@TTMLIR_ENABLE_PYKERNEL@" == "ON" diff --git a/test/ttmlir/Dialect/TTNN/Transforms/DecomposeLayouts/decomposing_layouts_from_host.mlir b/test/ttmlir/Dialect/TTNN/Transforms/DecomposeLayouts/decomposing_layouts_from_host.mlir new file mode 100644 index 0000000000..93fc61a72d --- /dev/null +++ b/test/ttmlir/Dialect/TTNN/Transforms/DecomposeLayouts/decomposing_layouts_from_host.mlir @@ -0,0 +1,237 @@ +// RUN: ttmlir-opt --ttnn-decompose-layouts %s | FileCheck %s +#device = #tt.device (0, d0, d1)>, l1Map = (d0, d1)[s0, s1] -> (0, d0 floordiv s0, d1 floordiv s1, (d0 mod s0) * s1 + d1 mod s1), dramMap = (d0, d1)[s0, s1] -> (0, 0, ((((d0 floordiv s0) * 8 + d1 floordiv s1) * (s1 * s0) + (d0 mod s0) * s1 + d1 mod s1) floordiv 8192) mod 12, (((d0 floordiv s0) * 8 + d1 floordiv s1) * (s1 * s0) + (d0 mod s0) * s1 + d1 mod s1) floordiv 98304 + (((d0 floordiv s0) * 8 + d1 floordiv s1) * (s1 * s0) + (d0 mod s0) * s1 + d1 mod s1) mod 8192), meshShape = , chipIds = [0]> +#system_desc = #tt.system_desc<[{role = host, target_triple = "x86_64-pc-linux"}], [{arch = , grid = 8x8, l1_size = 1499136, num_dram_channels = 12, dram_channel_size = 1073741824, noc_l1_address_align_bytes = 16, pcie_address_align_bytes = 32, noc_dram_address_align_bytes = 32, l1_unreserved_base = 99104, erisc_l1_unreserved_base = 104480, dram_unreserved_base = 32, dram_unreserved_end = 1073196736, physical_cores = {worker = [ 18x18, 18x19, 18x20, 18x21, 18x22, 18x23, 18x24, 18x25, 19x18, 19x19, 19x20, 19x21, 19x22, 19x23, 19x24, 19x25, 20x18, 20x19, 20x20, 20x21, 20x22, 20x23, 20x24, 20x25, 21x18, 21x19, 21x20, 21x21, 21x22, 21x23, 21x24, 21x25, 22x18, 22x19, 22x20, 22x21, 22x22, 22x23, 22x24, 22x25, 23x18, 23x19, 23x20, 23x21, 23x22, 23x23, 23x24, 23x25, 24x18, 24x19, 24x20, 24x21, 24x22, 24x23, 24x24, 24x25, 25x18, 25x19, 25x20, 25x21, 25x22, 25x23, 25x24, 25x25] dram = [ 0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0x10, 0x11] eth = [ 17x25] eth_inactive = [ 16x18, 16x19, 16x20, 16x21, 16x22, 16x23, 16x24, 16x25, 17x19, 17x20, 17x22, 17x23, 17x24]}, supported_data_types = [, , , , , , , , , , , ], supported_tile_sizes = [ 4x16, 16x16, 32x16, 4x32, 16x32, 32x32], num_cbs = 32}, {arch = , grid = 8x8, l1_size = 1499136, num_dram_channels = 12, dram_channel_size = 1073741824, noc_l1_address_align_bytes = 16, pcie_address_align_bytes = 32, noc_dram_address_align_bytes = 32, l1_unreserved_base = 99104, erisc_l1_unreserved_base = 104480, dram_unreserved_base = 32, dram_unreserved_end = 1073196736, physical_cores = {worker = [ 18x18, 18x19, 18x20, 18x21, 18x22, 18x23, 18x24, 18x25, 19x18, 19x19, 19x20, 19x21, 19x22, 19x23, 19x24, 19x25, 20x18, 20x19, 20x20, 20x21, 20x22, 20x23, 20x24, 20x25, 21x18, 21x19, 21x20, 21x21, 21x22, 21x23, 21x24, 21x25, 22x18, 22x19, 22x20, 22x21, 22x22, 22x23, 22x24, 22x25, 23x18, 23x19, 
23x20, 23x21, 23x22, 23x23, 23x24, 23x25, 24x18, 24x19, 24x20, 24x21, 24x22, 24x23, 24x24, 24x25, 25x18, 25x19, 25x20, 25x21, 25x22, 25x23, 25x24, 25x25] dram = [ 0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0x10, 0x11] eth = [ 16x25] eth_inactive = [ 16x19, 16x20, 16x21, 16x22, 16x23, 16x24, 17x18, 17x19, 17x20, 17x21, 17x22, 17x23, 17x24, 17x25]}, supported_data_types = [, , , , , , , , , , , ], supported_tile_sizes = [ 4x16, 16x16, 32x16, 4x32, 16x32, 32x32], num_cbs = 32}], [0, 1], [3 : i32, 0 : i32], [ 0x0x0x0], [<[0, 8, 0], [1, 0, 0]>]> +#dram = #ttnn.buffer_type +#system_memory = #ttnn.buffer_type +#ttnn_layout_host_rm = #ttnn.ttnn_layout<(d0, d1) -> (d0, d1), <1x1>, memref<64x128xf32, #system_memory>> +#ttnn_layout_host_rm_bf16 = #ttnn.ttnn_layout<(d0, d1) -> (d0, d1), <1x1>, memref<64x128xbf16, #system_memory>> +#ttnn_layout_host_tile = #ttnn.ttnn_layout<(d0, d1) -> (d0, d1), <1x1>, memref<2x4x!tt.tile<32x32, f32>, #system_memory>> +#ttnn_layout_host_tile_bf16 = #ttnn.ttnn_layout<(d0, d1) -> (d0, d1), <1x1>, memref<2x4x!tt.tile<32x32, bf16>, #system_memory>> +#ttnn_layout_device_rm = #ttnn.ttnn_layout<(d0, d1) -> (d0, d1), <1x1>, memref<64x128xf32, #dram>, > +#ttnn_layout_device_tile = #ttnn.ttnn_layout<(d0, d1) -> (d0, d1), <1x1>, memref<2x4x!tt.tile<32x32, f32>, #dram>, > +#ttnn_layout_device_tile_bf16 = #ttnn.ttnn_layout<(d0, d1) -> (d0, d1), <1x1>, memref<2x4x!tt.tile<32x32, bf16>, #dram>, > +#ttnn_layout_device_tile_u32 = #ttnn.ttnn_layout<(d0, d1) -> (d0, d1), <1x1>, memref<2x4x!tt.tile<32x32, u32>, #dram>, > +#ttnn_layout_device_rm_bf16 = #ttnn.ttnn_layout<(d0, d1) -> (d0, d1), <1x1>, memref<64x128xbf16, #dram>, > +#ttnn_layout1 = #ttnn.ttnn_layout<(d0, d1) -> (d0, d1), <1x1>, memref<64x128xbf16, #system_memory>> +#ttnn_layout2 = #ttnn.ttnn_layout<(d0, d1) -> (d0, d1), <1x1>, memref<2x4x!tt.tile<32x32, bf16>, #dram>, > +module attributes {tt.device = #device, tt.system_desc = #system_desc} { + + // Test cases when we do layout transformation from host and we don't change tensor layout and tensor data type + // + + // Test case when we move tensor from host to device. + func.func @from_host_to_device_layout_to_layout_dt_to_dt_create_to_device_op(%arg0: tensor<64x128xf32, #ttnn_layout_host_rm>) -> tensor<64x128xf32, #ttnn_layout_device_rm> { + // Verify that we only insert the to_device op when there are no layout or data type changes. + // CHECK: %[[GET_DEVICE_OP:.*]] = "ttnn.get_device"() + // CHECK: %[[TO_DEVICE_OP:.*]] = "ttnn.to_device"(%arg0, %[[GET_DEVICE_OP]]) + // CHECK-SAME: memory_config = #ttnn.memory_config<#dram, <<64x128>>, > + // CHECK: return %[[TO_DEVICE_OP]] + %0 = "ttnn.get_device"() <{mesh_shape = #ttnn}> : () -> !tt.device<#device> + %1 = "ttnn.to_layout"(%arg0, %0) <{dtype = #tt.supportedDataTypes, layout = #ttnn.layout, memory_config = #ttnn.memory_config<#dram, <<64x128>>, >}> : (tensor<64x128xf32, #ttnn_layout_host_rm>, !tt.device<#device>) -> tensor<64x128xf32, #ttnn_layout_device_rm> + return %1 : tensor<64x128xf32, #ttnn_layout_device_rm> + } + + // Test cases when we do layout transformation from host and we don't change tensor layout but we cast tensor data type. + // + + // Test case when we move tensor from host to host for tile case. + func.func @from_host_to_host_layout_to_layout_create_data_cast_op_tile(%arg0: tensor<64x128xf32, #ttnn_layout_host_tile>) -> tensor<64x128xbf16, #ttnn_layout_host_tile_bf16> { + // Typecast works only on device. 
Verify that for the tile case when the output is on host, we insert the to_dtype op to cast the data type on host. + // CHECK: %[[CASTING_OP:.*]] = "ttnn.to_dtype"(%arg0) + // CHECK-SAME: dtype = #tt.supportedDataTypes + // CHECK-NEXT: return %[[CASTING_OP]] + %1 = "ttnn.to_layout"(%arg0) <{dtype = #tt.supportedDataTypes, layout = #ttnn.layout, memory_config = #ttnn.memory_config<#system_memory, <<2x4>>>}> : (tensor<64x128xf32, #ttnn_layout_host_tile>) -> tensor<64x128xbf16, #ttnn_layout_host_tile_bf16> + return %1 : tensor<64x128xbf16, #ttnn_layout_host_tile_bf16> + } + + // Test case when we move tensor from host to host for row-major case. + func.func @from_host_to_host_layout_to_layout_create_data_cast_op_rm(%arg0: tensor<64x128xf32, #ttnn_layout_host_rm>) -> tensor<64x128xbf16, #ttnn_layout_host_rm_bf16> { + // Typecast works only on device. Verify that for the row-major case when the output is on host, we insert the to_dtype op to cast the data type on host. + // CHECK: %[[CASTING_OP:.*]] = "ttnn.to_dtype"(%arg0) + // CHECK-SAME: dtype = #tt.supportedDataTypes + // CHECK-NEXT: return %[[CASTING_OP]] + %1 = "ttnn.to_layout"(%arg0) <{dtype = #tt.supportedDataTypes, layout = #ttnn.layout, memory_config = #ttnn.memory_config<#system_memory, <<64x128>>>}> : (tensor<64x128xf32, #ttnn_layout_host_rm>) -> tensor<64x128xbf16, #ttnn_layout_host_rm_bf16> + return %1 : tensor<64x128xbf16, #ttnn_layout_host_rm_bf16> + } + + // Test case when we move tensor from host to device for row-major case. + func.func @from_host_to_device_layout_to_layout_create_data_cast_op_rm(%arg0: tensor<64x128xf32, #ttnn_layout_host_rm>) -> tensor<64x128xbf16, #ttnn_layout_device_rm_bf16> { + // Typecast on device only works for tile layout. Verify that for the row-major case we insert the to_dtype op to cast the data type on host and than move the tensor to device. + // CHECK: %[[GET_DEVICE_OP:.*]] = "ttnn.get_device"() + // CHECK-NEXT: %[[CASTING_OP:.*]] = "ttnn.to_dtype"(%arg0) + // CHECK-SAME: dtype = #tt.supportedDataTypes + // CHECK: %[[TO_DEVICE_OP:.*]] = "ttnn.to_device"(%[[CASTING_OP]], %[[GET_DEVICE_OP]]) + // CHECK-SAME: memory_config = #ttnn.memory_config<#dram, <<64x128>>, > + // CHECK-NEXT: return %[[TO_DEVICE_OP]] + %0 = "ttnn.get_device"() <{mesh_shape = #ttnn}> : () -> !tt.device<#device> + %1 = "ttnn.to_layout"(%arg0, %0) <{dtype = #tt.supportedDataTypes, layout = #ttnn.layout, memory_config = #ttnn.memory_config<#dram, <<64x128>>, >}> : (tensor<64x128xf32, #ttnn_layout_host_rm>, !tt.device<#device>) -> tensor<64x128xbf16, #ttnn_layout_device_rm_bf16> + return %1 : tensor<64x128xbf16, #ttnn_layout_device_rm_bf16> + } + + // Test case when we move tensor from host to device for tile case. + func.func @from_host_to_device_layout_to_layout_create_data_cast_op_tile(%arg0: tensor<64x128xf32, #ttnn_layout_host_tile>) -> tensor<64x128xbf16, #ttnn_layout_device_tile_bf16> { + // Typecast on device only works for tile layout. Verify that for the tile case we insert the to_device op and the typecast op to cast the data type on device. 
+ // CHECK: %[[GET_DEVICE_OP:.*]] = "ttnn.get_device"() + // CHECK-NEXT: %[[TO_DEVICE_OP:.*]] = "ttnn.to_device"(%arg0, %[[GET_DEVICE_OP]]) + // CHECK-SAME: memory_config = #ttnn.memory_config<#dram, <<64x128>>, > + // CHECK-NEXT: %[[CASTING_OP:.*]] = "ttnn.typecast"(%[[TO_DEVICE_OP]]) + // CHECK-SAME: dtype = #tt.supportedDataTypes + // CHECK-NEXT: return %[[CASTING_OP]] + %0 = "ttnn.get_device"() <{mesh_shape = #ttnn}> : () -> !tt.device<#device> + %1 = "ttnn.to_layout"(%arg0, %0) <{dtype = #tt.supportedDataTypes, layout = #ttnn.layout, memory_config = #ttnn.memory_config<#dram, <<64x128>>, >}> : (tensor<64x128xf32, #ttnn_layout_host_tile>, !tt.device<#device>) -> tensor<64x128xbf16, #ttnn_layout_device_tile_bf16> + return %1 : tensor<64x128xbf16, #ttnn_layout_device_tile_bf16> + } + + // Test cases when we do layout transformation from host and we change tensor layout but we don't cast tensor data type. + // + + // Test case when we move tensor from host to host for tile -> row-major case. + func.func @from_host_to_host_dt_to_dt_from_tile_to_rm(%arg0: tensor<64x128xf32, #ttnn_layout_host_tile>) -> tensor<64x128xf32, #ttnn_layout_host_rm> { + // This test verifies that the `to_layout` operation is correctly inserted to change the layout from tile to row-major on the host. + // CHECK: %[[TO_LAYOUT_OP:.*]] = "ttnn.to_layout"(%arg0) + // CHECK-SAME: layout = #ttnn.layout + // CHECK-NEXT: return %[[TO_LAYOUT_OP]] + %1 = "ttnn.to_layout"(%arg0) <{dtype = #tt.supportedDataTypes, layout = #ttnn.layout, memory_config = #ttnn.memory_config<#system_memory, <<64x128>>>}> : (tensor<64x128xf32, #ttnn_layout_host_tile>) -> tensor<64x128xf32, #ttnn_layout_host_rm> + return %1 : tensor<64x128xf32, #ttnn_layout_host_rm> + } + + // Test case when we move tensor from host to host for row-major -> tile case. + func.func @from_host_to_host_dt_to_dt_from_rm_to_tile(%arg0: tensor<64x128xf32, #ttnn_layout_host_rm>) -> tensor<64x128xf32, #ttnn_layout_host_tile> { + // This test verifies that the `to_layout` operation is correctly inserted to change the layout from row-major to tile on the host. + // CHECK: %[[TO_LAYOUT_OP:.*]] = "ttnn.to_layout"(%arg0) + // CHECK-SAME: layout = #ttnn.layout + // CHECK-NEXT: return %[[TO_LAYOUT_OP]] + %1 = "ttnn.to_layout"(%arg0) <{dtype = #tt.supportedDataTypes, layout = #ttnn.layout, memory_config = #ttnn.memory_config<#system_memory, <<2x4>>>}> : (tensor<64x128xf32, #ttnn_layout_host_rm>) -> tensor<64x128xf32, #ttnn_layout_host_tile> + return %1 : tensor<64x128xf32, #ttnn_layout_host_tile> + } + + // Test case when we move tensor from host to device for tile -> row-major case. + func.func @from_host_to_device_dt_to_dt_from_tile_to_rm(%arg0: tensor<64x128xf32, #ttnn_layout_host_tile>) -> tensor<64x128xf32, #ttnn_layout_device_rm> { + // This test verifies that the `to_layout` and `to_device` operations are correctly inserted to change the layout from tile to row-major on the host and than move the tensor to the device. 
+ // CHECK: %[[GET_DEVICE_OP:.*]] = "ttnn.get_device"() + // CHECK-NEXT: %[[TO_LAYOUT_OP:.*]] = "ttnn.to_layout"(%arg0) + // CHECK-SAME: layout = #ttnn.layout + // CHECK-NEXT: %[[TO_DEVICE_OP:.*]] = "ttnn.to_device"(%[[TO_LAYOUT_OP]], %[[GET_DEVICE_OP]]) + // CHECK-NEXT: return %[[TO_DEVICE_OP]] + %0 = "ttnn.get_device"() <{mesh_shape = #ttnn}> : () -> !tt.device<#device> + %1 = "ttnn.to_layout"(%arg0, %0) <{dtype = #tt.supportedDataTypes, layout = #ttnn.layout, memory_config = #ttnn.memory_config<#dram, <<64x128>>, >}> : (tensor<64x128xf32, #ttnn_layout_host_tile>, !tt.device<#device>) -> tensor<64x128xf32, #ttnn_layout_device_rm> + return %1 : tensor<64x128xf32, #ttnn_layout_device_rm> + } + + // Test case when we move tensor from host to device for row-major -> tile case for bf16 data type. + func.func @from_host_to_device_dt_to_dt_from_rm_to_tile_bf16(%arg0: tensor<64x128xbf16, #ttnn_layout_host_rm_bf16>) -> tensor<64x128xbf16, #ttnn_layout_device_tile_bf16> { + // This test verifies that the `to_device` and `to_layout` operations are correctly inserted to change the layout from row-major to tile on the device. + // Specifically, it ensures that BF16 tiling is performed on the device. + // CHECK: %[[GET_DEVICE_OP:.*]] = "ttnn.get_device"() + // CHECK-NEXT: %[[TO_DEVICE_OP:.*]] = "ttnn.to_device"(%arg0, %[[GET_DEVICE_OP]]) + // CHECK-NEXT: %[[TO_LAYOUT_OP:.*]] = "ttnn.to_layout"(%[[TO_DEVICE_OP]], %[[GET_DEVICE_OP]]) + // CHECK-SAME: layout = #ttnn.layout + // CHECK-NEXT: return %[[TO_LAYOUT_OP]] + %0 = "ttnn.get_device"() <{mesh_shape = #ttnn}> : () -> !tt.device<#device> + %1 = "ttnn.to_layout"(%arg0, %0) <{dtype = #tt.supportedDataTypes, layout = #ttnn.layout, memory_config = #ttnn.memory_config<#dram, <<2x4>>, >}> : (tensor<64x128xbf16, #ttnn_layout_host_rm_bf16>, !tt.device<#device>) -> tensor<64x128xbf16, #ttnn_layout_device_tile_bf16> + return %1 : tensor<64x128xbf16, #ttnn_layout_device_tile_bf16> + } + + // Test case when we move tensor from host to device for row-major -> tile case for non-bf16 data type. + func.func @from_host_to_device_dt_to_dt_from_rm_to_tile_f32(%arg0: tensor<64x128xf32, #ttnn_layout_host_rm>) -> tensor<64x128xf32, #ttnn_layout_device_tile> { + // This test verifies that the `to_layout` and `to_device` operations are correctly inserted to change the layout from row-major to tile on the host for non bf16 data type. + // Specifically, it ensures that non-BF16 tiling is performed on the host and then moved to the device. + // CHECK: %[[GET_DEVICE_OP:.*]] = "ttnn.get_device"() + // CHECK-NEXT: %[[TO_LAYOUT_OP:.*]] = "ttnn.to_layout"(%arg0) + // CHECK-SAME: layout = #ttnn.layout + // CHECK-NEXT: %[[TO_DEVICE_OP:.*]] = "ttnn.to_device"(%[[TO_LAYOUT_OP]], %[[GET_DEVICE_OP]]) + // CHECK-NEXT: return %[[TO_DEVICE_OP]] + %0 = "ttnn.get_device"() <{mesh_shape = #ttnn}> : () -> !tt.device<#device> + %1 = "ttnn.to_layout"(%arg0, %0) <{dtype = #tt.supportedDataTypes, layout = #ttnn.layout, memory_config = #ttnn.memory_config<#dram, <<2x4>>, >}> : (tensor<64x128xf32, #ttnn_layout_host_rm>, !tt.device<#device>) -> tensor<64x128xf32, #ttnn_layout_device_tile> + return %1 : tensor<64x128xf32, #ttnn_layout_device_tile> + } + + // Test cases when we do layout transformation from host and we change both tensor layout and tensor data type. + // + + // Test case when we move tensor from host to host for tile -> row-major case and data type cast. 
+ func.func @from_host_to_host_from_bf16_to_f32_from_tile_to_rm(%arg0: tensor<64x128xbf16, #ttnn_layout_host_tile_bf16>) -> tensor<64x128xf32, #ttnn_layout_host_rm> { + // This test verifies that the `to_layout` and `to_dtype` operations are correctly inserted to change the layout from tile to row-major and cast data type from bf16 to f32 on host. + // CHECK: %[[CASTING_OP:.*]] = "ttnn.to_dtype"(%arg0) + // CHECK-SAME: dtype = #tt.supportedDataTypes + // CHECK-NEXT: %[[TO_LAYOUT_OP:.*]] = "ttnn.to_layout"(%[[CASTING_OP]]) + // CHECK-SAME: layout = #ttnn.layout + // CHECK-NEXT: return %[[TO_LAYOUT_OP]] + %1 = "ttnn.to_layout"(%arg0) <{dtype = #tt.supportedDataTypes, layout = #ttnn.layout, memory_config = #ttnn.memory_config<#system_memory, <<64x128>>>}> : (tensor<64x128xbf16, #ttnn_layout_host_tile_bf16>) -> tensor<64x128xf32, #ttnn_layout_host_rm> + return %1 : tensor<64x128xf32, #ttnn_layout_host_rm> + } + + // Test case when we move tensor from host to host for row-major -> tile case and data type cast. + func.func @from_host_to_host_from_bf16_to_f32_from_rm_to_tile(%arg0: tensor<64x128xbf16, #ttnn_layout_host_rm_bf16>) -> tensor<64x128xf32, #ttnn_layout_host_tile> { + // This test verifies that the `to_layout` and `to_dtype` operations are correctly inserted to change the layout from row-major to tile and cast data type from bf16 to f32 on host. + // CHECK: %[[CASTING_OP:.*]] = "ttnn.to_dtype"(%arg0) + // CHECK-SAME: dtype = #tt.supportedDataTypes + // CHECK-NEXT: %[[TO_LAYOUT_OP:.*]] = "ttnn.to_layout"(%[[CASTING_OP]]) + // CHECK-SAME: layout = #ttnn.layout + // CHECK-NEXT: return %[[TO_LAYOUT_OP]] + %1 = "ttnn.to_layout"(%arg0) <{dtype = #tt.supportedDataTypes, layout = #ttnn.layout, memory_config = #ttnn.memory_config<#system_memory, <<2x4>>>}> : (tensor<64x128xbf16, #ttnn_layout_host_rm_bf16>) -> tensor<64x128xf32, #ttnn_layout_host_tile> + return %1 : tensor<64x128xf32, #ttnn_layout_host_tile> + } + + // Test case when we move tensor from host to device for tile -> row-major case and cast input from bf16. + func.func @from_host_to_device_data_type_from_bf16_to_f32_from_tile_to_rm(%arg0: tensor<64x128xbf16, #ttnn_layout_host_tile_bf16>) -> tensor<64x128xf32, #ttnn_layout_device_rm> { + // This test verifies that the `to_dtype`, `to_layout` and `to_device` operations are correctly inserted to change the layout from tile to row-major and cast data type from bf16 to f32 on host and then move tensor to device. + // CHECK: %[[GET_DEVICE_OP:.*]] = "ttnn.get_device"() + // CHECK-NEXT: %[[CASTING_OP:.*]] = "ttnn.to_dtype"(%arg0) + // CHECK-SAME: dtype = #tt.supportedDataTypes + // CHECK-NEXT: %[[TO_LAYOUT_OP:.*]] = "ttnn.to_layout"(%[[CASTING_OP]]) + // CHECK-SAME: layout = #ttnn.layout + // CHECK-NEXT: %[[TO_DEVICE_OP:.*]] = "ttnn.to_device"(%[[TO_LAYOUT_OP]], %[[GET_DEVICE_OP]]) + // CHECK-SAME: memory_config = #ttnn.memory_config<#dram, <<64x128>>, > + // CHECK-NEXT: return %[[TO_DEVICE_OP]] + %0 = "ttnn.get_device"() <{mesh_shape = #ttnn}> : () -> !tt.device<#device> + %1 = "ttnn.to_layout"(%arg0, %0) <{dtype = #tt.supportedDataTypes, layout = #ttnn.layout, memory_config = #ttnn.memory_config<#dram, <<64x128>>, >}> : (tensor<64x128xbf16, #ttnn_layout_host_tile_bf16>, !tt.device<#device>) -> tensor<64x128xf32, #ttnn_layout_device_rm> + return %1 : tensor<64x128xf32, #ttnn_layout_device_rm> + } + + // Test case when we move tensor from host to device for row-major -> tile case and cast input from bf16. 
+ func.func @from_host_to_device_data_type_from_bf16_to_f32_from_rm_to_tile(%arg0: tensor<64x128xbf16, #ttnn_layout_host_rm_bf16>) -> tensor<64x128xf32, #ttnn_layout_device_tile> { + // This test verifies that the `to_device`, `to_layout` and `typecast` operations are correctly inserted to change the layout from row-major to tile and cast + // data type from bf16 to f32 on device. + // CHECK: %[[GET_DEVICE_OP:.*]] = "ttnn.get_device"() + // CHECK-NEXT: %[[TO_DEVICE_OP:.*]] = "ttnn.to_device"(%arg0, %[[GET_DEVICE_OP]]) + // CHECK-NEXT: %[[TO_LAYOUT_OP:.*]] = "ttnn.to_layout"(%[[TO_DEVICE_OP]], %[[GET_DEVICE_OP]]) + // CHECK-SAME: layout = #ttnn.layout + // CHECK-NEXT: %[[CASTING_OP:.*]] = "ttnn.typecast"(%[[TO_LAYOUT_OP]]) + // CHECK-SAME: dtype = #tt.supportedDataTypes + // CHECK-NEXT: return %[[CASTING_OP]] + %0 = "ttnn.get_device"() <{mesh_shape = #ttnn}> : () -> !tt.device<#device> + %1 = "ttnn.to_layout"(%arg0, %0) <{dtype = #tt.supportedDataTypes, layout = #ttnn.layout, memory_config = #ttnn.memory_config<#dram, <<2x4>>, >}> : (tensor<64x128xbf16, #ttnn_layout_host_rm_bf16>, !tt.device<#device>) -> tensor<64x128xf32, #ttnn_layout_device_tile> + return %1 : tensor<64x128xf32, #ttnn_layout_device_tile> + } + + // Test case when we move tensor from host to device for row-major -> tile case and cast input to bf16. + func.func @from_host_to_device_data_type_from_f32_to_bf16_from_rm_to_tile(%arg0: tensor<64x128xf32, #ttnn_layout_host_rm>) -> tensor<64x128xbf16, #ttnn_layout_device_tile_bf16> { + // This test verifies that the `to_dtype`, `to_device` and `to_layout` operations are correctly inserted to cast the data type from f32 to bf16 on host and then move tensor to device and change the layout from row-major to tile. + // CHECK: %[[GET_DEVICE_OP:.*]] = "ttnn.get_device"() + // CHECK-NEXT: %[[CASTING_OP:.*]] = "ttnn.to_dtype"(%arg0) + // CHECK-SAME: dtype = #tt.supportedDataTypes + // CHECK-NEXT: %[[TO_DEVICE_OP:.*]] = "ttnn.to_device"(%[[CASTING_OP]], %[[GET_DEVICE_OP]]) + // CHECK-NEXT: %[[TO_LAYOUT_OP:.*]] = "ttnn.to_layout"(%[[TO_DEVICE_OP]], %[[GET_DEVICE_OP]]) + // CHECK-SAME: layout = #ttnn.layout + // CHECK-NEXT: return %[[TO_LAYOUT_OP]] + %0 = "ttnn.get_device"() <{mesh_shape = #ttnn}> : () -> !tt.device<#device> + %1 = "ttnn.to_layout"(%arg0, %0) <{dtype = #tt.supportedDataTypes, layout = #ttnn.layout, memory_config = #ttnn.memory_config<#dram, <<2x4>>, >}> : (tensor<64x128xf32, #ttnn_layout_host_rm>, !tt.device<#device>) -> tensor<64x128xbf16, #ttnn_layout_device_tile_bf16> + return %1 : tensor<64x128xbf16, #ttnn_layout_device_tile_bf16> + } + + // Test case when we move tensor from host to device for row-major -> tile case and we don't cast data type to bf16 nor from bf16. + func.func @from_host_to_device_data_type_from_f32_to_u32_from_rm_to_tile(%arg0: tensor<64x128xf32, #ttnn_layout_host_rm>) -> tensor<64x128xi32, #ttnn_layout_device_tile_u32> { + // This test verifies that the `to_dtype`, `to_layout` and `to_device` operations are correctly inserted to cast the data type from f32 to f16 and tilize on host and then move tensor to device. 
+ // CHECK: %[[GET_DEVICE_OP:.*]] = "ttnn.get_device"() + // CHECK-NEXT: %[[CASTING_OP:.*]] = "ttnn.to_dtype"(%arg0) + // CHECK-SAME: dtype = #tt.supportedDataTypes + // CHECK-NEXT: %[[TO_LAYOUT_OP:.*]] = "ttnn.to_layout"(%[[CASTING_OP]]) + // CHECK-SAME: layout = #ttnn.layout + // CHECK-NEXT: %[[TO_DEVICE_OP:.*]] = "ttnn.to_device"(%[[TO_LAYOUT_OP]], %[[GET_DEVICE_OP]]) + // CHECK-NEXT: return %[[TO_DEVICE_OP]] + %0 = "ttnn.get_device"() <{mesh_shape = #ttnn}> : () -> !tt.device<#device> + %1 = "ttnn.to_layout"(%arg0, %0) <{dtype = #tt.supportedDataTypes, layout = #ttnn.layout, memory_config = #ttnn.memory_config<#dram, <<2x4>>, >}> : (tensor<64x128xf32, #ttnn_layout_host_rm>, !tt.device<#device>) -> tensor<64x128xi32, #ttnn_layout_device_tile_u32> + return %1 : tensor<64x128xi32, #ttnn_layout_device_tile_u32> + } +} diff --git a/test/ttmlir/Dialect/TTNN/simple_clamp.mlir b/test/ttmlir/Dialect/TTNN/simple_clamp.mlir index 272e07175b..f341792c74 100644 --- a/test/ttmlir/Dialect/TTNN/simple_clamp.mlir +++ b/test/ttmlir/Dialect/TTNN/simple_clamp.mlir @@ -2,8 +2,9 @@ module attributes {} { func.func @clamp(%arg0: tensor<64x128xbf16>) -> tensor<64x128xbf16> { %0 = tensor.empty() : tensor<64x128xbf16> + // CHECK: %[[GET_DEVICE:.*]] = "ttnn.get_device"() // CHECK: %[[DEVICE:.*]] = "ttnn.to_device"(%arg0, - // CHECK: %[[LAYOUT:.*]] = "ttnn.to_layout"(%[[DEVICE]]) + // CHECK: %[[LAYOUT:.*]] = "ttnn.to_layout"(%[[DEVICE]], %[[GET_DEVICE]]) // CHECK: = "ttnn.clamp"(%[[LAYOUT]]) // CHECK-SAME: {max = 3.000000e+00 : f32, min = 2.000000e+00 : f32} // CHECK-SAME: [[TENSOR:tensor<64x128xbf16]], #ttnn_layout{{[0-9]+}}>) -> [[TENSOR]] diff --git a/test/ttmlir/EmitC/TTNN/other/embedding.mlir b/test/ttmlir/EmitC/TTNN/other/embedding.mlir index b7a42638ac..627b514b06 100644 --- a/test/ttmlir/EmitC/TTNN/other/embedding.mlir +++ b/test/ttmlir/EmitC/TTNN/other/embedding.mlir @@ -3,9 +3,6 @@ // RUN: ttmlir-opt --ttnn-modify-signatures-for-dylib --convert-ttnn-to-emitc %t.mlir > %t2.mlir // RUN: ttmlir-translate --mlir-to-cpp %t2.mlir > %basename_t.cpp -// UNSUPPORTED: true -// Outstanding bug: https://github.com/tenstorrent/tt-mlir/issues/1938 - func.func @embedding(%arg0: tensor<32x32xbf16>, %arg1: tensor<512x128xbf16>) -> tensor<32x32x128xbf16> { %0 = tensor.empty() : tensor<32x32x128xbf16> %1 = "ttir.embedding"(%arg0, %arg1, %0) : (tensor<32x32xbf16>, tensor<512x128xbf16>, tensor<32x32x128xbf16>) -> tensor<32x32x128xbf16> diff --git a/test/ttmlir/Silicon/TTNN/Transforms/DecomposeLayouts/create_system_desc_device.mlir b/test/ttmlir/Silicon/TTNN/Transforms/DecomposeLayouts/create_system_desc_device.mlir new file mode 100644 index 0000000000..07f60229c9 --- /dev/null +++ b/test/ttmlir/Silicon/TTNN/Transforms/DecomposeLayouts/create_system_desc_device.mlir @@ -0,0 +1,4 @@ +// RUN: ttmlir-opt %s +// UNSUPPORTED: true +module { +} diff --git a/test/ttmlir/Silicon/TTNN/Transforms/DecomposeLayouts/decomposing_layouts_from_host.mlir b/test/ttmlir/Silicon/TTNN/Transforms/DecomposeLayouts/decomposing_layouts_from_host.mlir new file mode 100644 index 0000000000..b2416fc1ee --- /dev/null +++ b/test/ttmlir/Silicon/TTNN/Transforms/DecomposeLayouts/decomposing_layouts_from_host.mlir @@ -0,0 +1,240 @@ +// RUN: ttmlir-opt --ttir-load-system-desc="path=%system_desc_path%" --ttir-implicit-device %ttmlir_test_root/ttmlir/Silicon/TTNN/Transforms/DecomposeLayouts/create_system_desc_device.mlir > %t.mlir +// RUN: python %ttmlir_scripts_root/extract-and-replace-system-desc-and-device.py %t.mlir %s > %t_replaced.mlir +// RUN: 
ttmlir-opt --ttnn-decompose-layouts %t_replaced.mlir > %t_ttnn_mlir.mlir +// RUN: FileCheck %t_replaced.mlir --input-file=%t_ttnn_mlir.mlir +// RUN: ttmlir-translate --ttnn-to-flatbuffer %t_ttnn_mlir.mlir > %t.ttnn +#device = #tt.device<> +#system_desc = #tt.system_desc<> +#dram = #ttnn.buffer_type +#system_memory = #ttnn.buffer_type +#ttnn_layout_host_rm = #ttnn.ttnn_layout<(d0, d1) -> (d0, d1), <1x1>, memref<64x128xf32, #system_memory>> +#ttnn_layout_host_rm_bf16 = #ttnn.ttnn_layout<(d0, d1) -> (d0, d1), <1x1>, memref<64x128xbf16, #system_memory>> +#ttnn_layout_host_tile = #ttnn.ttnn_layout<(d0, d1) -> (d0, d1), <1x1>, memref<2x4x!tt.tile<32x32, f32>, #system_memory>> +#ttnn_layout_host_tile_bf16 = #ttnn.ttnn_layout<(d0, d1) -> (d0, d1), <1x1>, memref<2x4x!tt.tile<32x32, bf16>, #system_memory>> +#ttnn_layout_device_rm = #ttnn.ttnn_layout<(d0, d1) -> (d0, d1), <1x1>, memref<64x128xf32, #dram>, > +#ttnn_layout_device_tile = #ttnn.ttnn_layout<(d0, d1) -> (d0, d1), <1x1>, memref<2x4x!tt.tile<32x32, f32>, #dram>, > +#ttnn_layout_device_tile_bf16 = #ttnn.ttnn_layout<(d0, d1) -> (d0, d1), <1x1>, memref<2x4x!tt.tile<32x32, bf16>, #dram>, > +#ttnn_layout_device_tile_u32 = #ttnn.ttnn_layout<(d0, d1) -> (d0, d1), <1x1>, memref<2x4x!tt.tile<32x32, u32>, #dram>, > +#ttnn_layout_device_rm_bf16 = #ttnn.ttnn_layout<(d0, d1) -> (d0, d1), <1x1>, memref<64x128xbf16, #dram>, > +#ttnn_layout1 = #ttnn.ttnn_layout<(d0, d1) -> (d0, d1), <1x1>, memref<64x128xbf16, #system_memory>> +#ttnn_layout2 = #ttnn.ttnn_layout<(d0, d1) -> (d0, d1), <1x1>, memref<2x4x!tt.tile<32x32, bf16>, #dram>, > +module attributes {tt.device = #device, tt.system_desc = #system_desc} { + // Test cases when we do layout transformation from host and we don't change tensor layout and tensor data type + // + + // Test case when we move tensor from host to device. + func.func @from_host_to_device_layout_to_layout_dt_to_dt_create_to_device_op(%arg0: tensor<64x128xf32, #ttnn_layout_host_rm>) -> tensor<64x128xf32, #ttnn_layout_device_rm> { + // Verify that we only insert the to_device op when there are no layout or data type changes. + // CHECK: %[[GET_DEVICE_OP:.*]] = "ttnn.get_device"() + // CHECK: %[[TO_DEVICE_OP:.*]] = "ttnn.to_device"(%arg0, %[[GET_DEVICE_OP]]) + // CHECK-SAME: memory_config = #ttnn.memory_config<#dram, <<64x128>>, > + // CHECK: return %[[TO_DEVICE_OP]] + %0 = "ttnn.get_device"() <{mesh_shape = #ttnn}> : () -> !tt.device<#device> + %1 = "ttnn.to_layout"(%arg0, %0) <{dtype = #tt.supportedDataTypes, layout = #ttnn.layout, memory_config = #ttnn.memory_config<#dram, <<64x128>>, >}> : (tensor<64x128xf32, #ttnn_layout_host_rm>, !tt.device<#device>) -> tensor<64x128xf32, #ttnn_layout_device_rm> + return %1 : tensor<64x128xf32, #ttnn_layout_device_rm> + } + + // Test cases when we do layout transformation from host and we don't change tensor layout but we cast tensor data type. + // + + // Test case when we move tensor from host to host for tile case. + func.func @from_host_to_host_layout_to_layout_create_data_cast_op_tile(%arg0: tensor<64x128xf32, #ttnn_layout_host_tile>) -> tensor<64x128xbf16, #ttnn_layout_host_tile_bf16> { + // Typecast works only on device. Verify that for the tile case when the output is on host, we insert the to_dtype op to cast the data type on host. 
+    // CHECK: %[[CASTING_OP:.*]] = "ttnn.to_dtype"(%arg0)
+    // CHECK-SAME: dtype = #tt.supportedDataTypes
+    // CHECK-NEXT: return %[[CASTING_OP]]
+    %1 = "ttnn.to_layout"(%arg0) <{dtype = #tt.supportedDataTypes, layout = #ttnn.layout, memory_config = #ttnn.memory_config<#system_memory, <<2x4>>>}> : (tensor<64x128xf32, #ttnn_layout_host_tile>) -> tensor<64x128xbf16, #ttnn_layout_host_tile_bf16>
+    return %1 : tensor<64x128xbf16, #ttnn_layout_host_tile_bf16>
+  }
+
+  // Test case when we move tensor from host to host for row-major case.
+  func.func @from_host_to_host_layout_to_layout_create_data_cast_op_rm(%arg0: tensor<64x128xf32, #ttnn_layout_host_rm>) -> tensor<64x128xbf16, #ttnn_layout_host_rm_bf16> {
+    // Typecast works only on device. Verify that for the row-major case when the output is on host, we insert the to_dtype op to cast the data type on host.
+    // CHECK: %[[CASTING_OP:.*]] = "ttnn.to_dtype"(%arg0)
+    // CHECK-SAME: dtype = #tt.supportedDataTypes
+    // CHECK-NEXT: return %[[CASTING_OP]]
+    %1 = "ttnn.to_layout"(%arg0) <{dtype = #tt.supportedDataTypes, layout = #ttnn.layout, memory_config = #ttnn.memory_config<#system_memory, <<64x128>>>}> : (tensor<64x128xf32, #ttnn_layout_host_rm>) -> tensor<64x128xbf16, #ttnn_layout_host_rm_bf16>
+    return %1 : tensor<64x128xbf16, #ttnn_layout_host_rm_bf16>
+  }
+
+  // Test case when we move tensor from host to device for row-major case.
+  func.func @from_host_to_device_layout_to_layout_create_data_cast_op_rm(%arg0: tensor<64x128xf32, #ttnn_layout_host_rm>) -> tensor<64x128xbf16, #ttnn_layout_device_rm_bf16> {
+    // Typecast on device only works for tile layout. Verify that for the row-major case we insert the to_dtype op to cast the data type on host and then move the tensor to device.
+    // CHECK: %[[GET_DEVICE_OP:.*]] = "ttnn.get_device"()
+    // CHECK-NEXT: %[[CASTING_OP:.*]] = "ttnn.to_dtype"(%arg0)
+    // CHECK-SAME: dtype = #tt.supportedDataTypes
+    // CHECK: %[[TO_DEVICE_OP:.*]] = "ttnn.to_device"(%[[CASTING_OP]], %[[GET_DEVICE_OP]])
+    // CHECK-SAME: memory_config = #ttnn.memory_config<#dram, <<64x128>>, >
+    // CHECK-NEXT: return %[[TO_DEVICE_OP]]
+    %0 = "ttnn.get_device"() <{mesh_shape = #ttnn}> : () -> !tt.device<#device>
+    %1 = "ttnn.to_layout"(%arg0, %0) <{dtype = #tt.supportedDataTypes, layout = #ttnn.layout, memory_config = #ttnn.memory_config<#dram, <<64x128>>, >}> : (tensor<64x128xf32, #ttnn_layout_host_rm>, !tt.device<#device>) -> tensor<64x128xbf16, #ttnn_layout_device_rm_bf16>
+    return %1 : tensor<64x128xbf16, #ttnn_layout_device_rm_bf16>
+  }
+
+  // Test case when we move tensor from host to device for tile case.
+  func.func @from_host_to_device_layout_to_layout_create_data_cast_op_tile(%arg0: tensor<64x128xf32, #ttnn_layout_host_tile>) -> tensor<64x128xbf16, #ttnn_layout_device_tile_bf16> {
+    // Typecast on device only works for tile layout. Verify that for the tile case we insert the to_device op and the typecast op to cast the data type on device.
+    // CHECK: %[[GET_DEVICE_OP:.*]] = "ttnn.get_device"()
+    // CHECK-NEXT: %[[TO_DEVICE_OP:.*]] = "ttnn.to_device"(%arg0, %[[GET_DEVICE_OP]])
+    // CHECK-SAME: memory_config = #ttnn.memory_config<#dram, <<64x128>>, >
+    // CHECK-NEXT: %[[CASTING_OP:.*]] = "ttnn.typecast"(%[[TO_DEVICE_OP]])
+    // CHECK-SAME: dtype = #tt.supportedDataTypes
+    // CHECK-NEXT: return %[[CASTING_OP]]
+    %0 = "ttnn.get_device"() <{mesh_shape = #ttnn}> : () -> !tt.device<#device>
+    %1 = "ttnn.to_layout"(%arg0, %0) <{dtype = #tt.supportedDataTypes, layout = #ttnn.layout, memory_config = #ttnn.memory_config<#dram, <<64x128>>, >}> : (tensor<64x128xf32, #ttnn_layout_host_tile>, !tt.device<#device>) -> tensor<64x128xbf16, #ttnn_layout_device_tile_bf16>
+    return %1 : tensor<64x128xbf16, #ttnn_layout_device_tile_bf16>
+  }
+
+  // Test cases when we do layout transformation from host and we change tensor layout but we don't cast tensor data type.
+  //
+
+  // Test case when we move tensor from host to host for tile -> row-major case.
+  func.func @from_host_to_host_dt_to_dt_from_tile_to_rm(%arg0: tensor<64x128xf32, #ttnn_layout_host_tile>) -> tensor<64x128xf32, #ttnn_layout_host_rm> {
+    // This test verifies that the `to_layout` operation is correctly inserted to change the layout from tile to row-major on the host.
+    // CHECK: %[[TO_LAYOUT_OP:.*]] = "ttnn.to_layout"(%arg0)
+    // CHECK-SAME: layout = #ttnn.layout
+    // CHECK-NEXT: return %[[TO_LAYOUT_OP]]
+    %1 = "ttnn.to_layout"(%arg0) <{dtype = #tt.supportedDataTypes, layout = #ttnn.layout, memory_config = #ttnn.memory_config<#system_memory, <<64x128>>>}> : (tensor<64x128xf32, #ttnn_layout_host_tile>) -> tensor<64x128xf32, #ttnn_layout_host_rm>
+    return %1 : tensor<64x128xf32, #ttnn_layout_host_rm>
+  }
+
+  // Test case when we move tensor from host to host for row-major -> tile case.
+  func.func @from_host_to_host_dt_to_dt_from_rm_to_tile(%arg0: tensor<64x128xf32, #ttnn_layout_host_rm>) -> tensor<64x128xf32, #ttnn_layout_host_tile> {
+    // This test verifies that the `to_layout` operation is correctly inserted to change the layout from row-major to tile on the host.
+    // CHECK: %[[TO_LAYOUT_OP:.*]] = "ttnn.to_layout"(%arg0)
+    // CHECK-SAME: layout = #ttnn.layout
+    // CHECK-NEXT: return %[[TO_LAYOUT_OP]]
+    %1 = "ttnn.to_layout"(%arg0) <{dtype = #tt.supportedDataTypes, layout = #ttnn.layout, memory_config = #ttnn.memory_config<#system_memory, <<2x4>>>}> : (tensor<64x128xf32, #ttnn_layout_host_rm>) -> tensor<64x128xf32, #ttnn_layout_host_tile>
+    return %1 : tensor<64x128xf32, #ttnn_layout_host_tile>
+  }
+
+  // Test case when we move tensor from host to device for tile -> row-major case.
+  func.func @from_host_to_device_dt_to_dt_from_tile_to_rm(%arg0: tensor<64x128xf32, #ttnn_layout_host_tile>) -> tensor<64x128xf32, #ttnn_layout_device_rm> {
+    // This test verifies that the `to_layout` and `to_device` operations are correctly inserted to change the layout from tile to row-major on the host and then move the tensor to the device.
+ // CHECK: %[[GET_DEVICE_OP:.*]] = "ttnn.get_device"() + // CHECK-NEXT: %[[TO_LAYOUT_OP:.*]] = "ttnn.to_layout"(%arg0) + // CHECK-SAME: layout = #ttnn.layout + // CHECK-NEXT: %[[TO_DEVICE_OP:.*]] = "ttnn.to_device"(%[[TO_LAYOUT_OP]], %[[GET_DEVICE_OP]]) + // CHECK-NEXT: return %[[TO_DEVICE_OP]] + %0 = "ttnn.get_device"() <{mesh_shape = #ttnn}> : () -> !tt.device<#device> + %1 = "ttnn.to_layout"(%arg0, %0) <{dtype = #tt.supportedDataTypes, layout = #ttnn.layout, memory_config = #ttnn.memory_config<#dram, <<64x128>>, >}> : (tensor<64x128xf32, #ttnn_layout_host_tile>, !tt.device<#device>) -> tensor<64x128xf32, #ttnn_layout_device_rm> + return %1 : tensor<64x128xf32, #ttnn_layout_device_rm> + } + + // Test case when we move tensor from host to device for row-major -> tile case for bf16 data type. + func.func @from_host_to_device_dt_to_dt_from_rm_to_tile_bf16(%arg0: tensor<64x128xbf16, #ttnn_layout_host_rm_bf16>) -> tensor<64x128xbf16, #ttnn_layout_device_tile_bf16> { + // This test verifies that the `to_device` and `to_layout` operations are correctly inserted to change the layout from row-major to tile on the device. + // Specifically, it ensures that BF16 tiling is performed on the device. + // CHECK: %[[GET_DEVICE_OP:.*]] = "ttnn.get_device"() + // CHECK-NEXT: %[[TO_DEVICE_OP:.*]] = "ttnn.to_device"(%arg0, %[[GET_DEVICE_OP]]) + // CHECK-NEXT: %[[TO_LAYOUT_OP:.*]] = "ttnn.to_layout"(%[[TO_DEVICE_OP]], %[[GET_DEVICE_OP]]) + // CHECK-SAME: layout = #ttnn.layout + // CHECK-NEXT: return %[[TO_LAYOUT_OP]] + %0 = "ttnn.get_device"() <{mesh_shape = #ttnn}> : () -> !tt.device<#device> + %1 = "ttnn.to_layout"(%arg0, %0) <{dtype = #tt.supportedDataTypes, layout = #ttnn.layout, memory_config = #ttnn.memory_config<#dram, <<2x4>>, >}> : (tensor<64x128xbf16, #ttnn_layout_host_rm_bf16>, !tt.device<#device>) -> tensor<64x128xbf16, #ttnn_layout_device_tile_bf16> + return %1 : tensor<64x128xbf16, #ttnn_layout_device_tile_bf16> + } + + // Test case when we move tensor from host to device for row-major -> tile case for non-bf16 data type. + func.func @from_host_to_device_dt_to_dt_from_rm_to_tile_f32(%arg0: tensor<64x128xf32, #ttnn_layout_host_rm>) -> tensor<64x128xf32, #ttnn_layout_device_tile> { + // This test verifies that the `to_layout` and `to_device` operations are correctly inserted to change the layout from row-major to tile on the host for non bf16 data type. + // Specifically, it ensures that non-BF16 tiling is performed on the host and then moved to the device. + // CHECK: %[[GET_DEVICE_OP:.*]] = "ttnn.get_device"() + // CHECK-NEXT: %[[TO_LAYOUT_OP:.*]] = "ttnn.to_layout"(%arg0) + // CHECK-SAME: layout = #ttnn.layout + // CHECK-NEXT: %[[TO_DEVICE_OP:.*]] = "ttnn.to_device"(%[[TO_LAYOUT_OP]], %[[GET_DEVICE_OP]]) + // CHECK-NEXT: return %[[TO_DEVICE_OP]] + %0 = "ttnn.get_device"() <{mesh_shape = #ttnn}> : () -> !tt.device<#device> + %1 = "ttnn.to_layout"(%arg0, %0) <{dtype = #tt.supportedDataTypes, layout = #ttnn.layout, memory_config = #ttnn.memory_config<#dram, <<2x4>>, >}> : (tensor<64x128xf32, #ttnn_layout_host_rm>, !tt.device<#device>) -> tensor<64x128xf32, #ttnn_layout_device_tile> + return %1 : tensor<64x128xf32, #ttnn_layout_device_tile> + } + + // Test cases when we do layout transformation from host and we change both tensor layout and tensor data type. + // + + // Test case when we move tensor from host to host for tile -> row-major case and data type cast. 
+ func.func @from_host_to_host_from_bf16_to_f32_from_tile_to_rm(%arg0: tensor<64x128xbf16, #ttnn_layout_host_tile_bf16>) -> tensor<64x128xf32, #ttnn_layout_host_rm> { + // This test verifies that the `to_layout` and `to_dtype` operations are correctly inserted to change the layout from tile to row-major and cast data type from bf16 to f32 on host. + // CHECK: %[[CASTING_OP:.*]] = "ttnn.to_dtype"(%arg0) + // CHECK-SAME: dtype = #tt.supportedDataTypes + // CHECK-NEXT: %[[TO_LAYOUT_OP:.*]] = "ttnn.to_layout"(%[[CASTING_OP]]) + // CHECK-SAME: layout = #ttnn.layout + // CHECK-NEXT: return %[[TO_LAYOUT_OP]] + %1 = "ttnn.to_layout"(%arg0) <{dtype = #tt.supportedDataTypes, layout = #ttnn.layout, memory_config = #ttnn.memory_config<#system_memory, <<64x128>>>}> : (tensor<64x128xbf16, #ttnn_layout_host_tile_bf16>) -> tensor<64x128xf32, #ttnn_layout_host_rm> + return %1 : tensor<64x128xf32, #ttnn_layout_host_rm> + } + + // Test case when we move tensor from host to host for row-major -> tile case and data type cast. + func.func @from_host_to_host_from_bf16_to_f32_from_rm_to_tile(%arg0: tensor<64x128xbf16, #ttnn_layout_host_rm_bf16>) -> tensor<64x128xf32, #ttnn_layout_host_tile> { + // This test verifies that the `to_layout` and `to_dtype` operations are correctly inserted to change the layout from row-major to tile and cast data type from bf16 to f32 on host. + // CHECK: %[[CASTING_OP:.*]] = "ttnn.to_dtype"(%arg0) + // CHECK-SAME: dtype = #tt.supportedDataTypes + // CHECK-NEXT: %[[TO_LAYOUT_OP:.*]] = "ttnn.to_layout"(%[[CASTING_OP]]) + // CHECK-SAME: layout = #ttnn.layout + // CHECK-NEXT: return %[[TO_LAYOUT_OP]] + %1 = "ttnn.to_layout"(%arg0) <{dtype = #tt.supportedDataTypes, layout = #ttnn.layout, memory_config = #ttnn.memory_config<#system_memory, <<2x4>>>}> : (tensor<64x128xbf16, #ttnn_layout_host_rm_bf16>) -> tensor<64x128xf32, #ttnn_layout_host_tile> + return %1 : tensor<64x128xf32, #ttnn_layout_host_tile> + } + + // Test case when we move tensor from host to device for tile -> row-major case and cast input from bf16. + func.func @from_host_to_device_data_type_from_bf16_to_f32_from_tile_to_rm(%arg0: tensor<64x128xbf16, #ttnn_layout_host_tile_bf16>) -> tensor<64x128xf32, #ttnn_layout_device_rm> { + // This test verifies that the `to_dtype`, `to_layout` and `to_device` operations are correctly inserted to change the layout from tile to row-major and cast data type from bf16 to f32 on host and then move tensor to device. + // CHECK: %[[GET_DEVICE_OP:.*]] = "ttnn.get_device"() + // CHECK-NEXT: %[[CASTING_OP:.*]] = "ttnn.to_dtype"(%arg0) + // CHECK-SAME: dtype = #tt.supportedDataTypes + // CHECK-NEXT: %[[TO_LAYOUT_OP:.*]] = "ttnn.to_layout"(%[[CASTING_OP]]) + // CHECK-SAME: layout = #ttnn.layout + // CHECK-NEXT: %[[TO_DEVICE_OP:.*]] = "ttnn.to_device"(%[[TO_LAYOUT_OP]], %[[GET_DEVICE_OP]]) + // CHECK-SAME: memory_config = #ttnn.memory_config<#dram, <<64x128>>, > + // CHECK-NEXT: return %[[TO_DEVICE_OP]] + %0 = "ttnn.get_device"() <{mesh_shape = #ttnn}> : () -> !tt.device<#device> + %1 = "ttnn.to_layout"(%arg0, %0) <{dtype = #tt.supportedDataTypes, layout = #ttnn.layout, memory_config = #ttnn.memory_config<#dram, <<64x128>>, >}> : (tensor<64x128xbf16, #ttnn_layout_host_tile_bf16>, !tt.device<#device>) -> tensor<64x128xf32, #ttnn_layout_device_rm> + return %1 : tensor<64x128xf32, #ttnn_layout_device_rm> + } + + // Test case when we move tensor from host to device for row-major -> tile case and cast input from bf16. 
+  func.func @from_host_to_device_data_type_from_bf16_to_f32_from_rm_to_tile(%arg0: tensor<64x128xbf16, #ttnn_layout_host_rm_bf16>) -> tensor<64x128xf32, #ttnn_layout_device_tile> {
+    // This test verifies that the `to_device`, `to_layout` and `typecast` operations are correctly inserted to change the layout from row-major to tile and cast
+    // data type from bf16 to f32 on device.
+    // CHECK: %[[GET_DEVICE_OP:.*]] = "ttnn.get_device"()
+    // CHECK-NEXT: %[[TO_DEVICE_OP:.*]] = "ttnn.to_device"(%arg0, %[[GET_DEVICE_OP]])
+    // CHECK-NEXT: %[[TO_LAYOUT_OP:.*]] = "ttnn.to_layout"(%[[TO_DEVICE_OP]], %[[GET_DEVICE_OP]])
+    // CHECK-SAME: layout = #ttnn.layout
+    // CHECK-NEXT: %[[CASTING_OP:.*]] = "ttnn.typecast"(%[[TO_LAYOUT_OP]])
+    // CHECK-SAME: dtype = #tt.supportedDataTypes
+    // CHECK-NEXT: return %[[CASTING_OP]]
+    %0 = "ttnn.get_device"() <{mesh_shape = #ttnn}> : () -> !tt.device<#device>
+    %1 = "ttnn.to_layout"(%arg0, %0) <{dtype = #tt.supportedDataTypes, layout = #ttnn.layout, memory_config = #ttnn.memory_config<#dram, <<2x4>>, >}> : (tensor<64x128xbf16, #ttnn_layout_host_rm_bf16>, !tt.device<#device>) -> tensor<64x128xf32, #ttnn_layout_device_tile>
+    return %1 : tensor<64x128xf32, #ttnn_layout_device_tile>
+  }
+
+  // Test case when we move tensor from host to device for row-major -> tile case and cast input to bf16.
+  func.func @from_host_to_device_data_type_from_f32_to_bf16_from_rm_to_tile(%arg0: tensor<64x128xf32, #ttnn_layout_host_rm>) -> tensor<64x128xbf16, #ttnn_layout_device_tile_bf16> {
+    // This test verifies that the `to_dtype`, `to_device` and `to_layout` operations are correctly inserted to cast the data type from f32 to bf16 on host and then move tensor to device and change the layout from row-major to tile.
+    // CHECK: %[[GET_DEVICE_OP:.*]] = "ttnn.get_device"()
+    // CHECK-NEXT: %[[CASTING_OP:.*]] = "ttnn.to_dtype"(%arg0)
+    // CHECK-SAME: dtype = #tt.supportedDataTypes
+    // CHECK-NEXT: %[[TO_DEVICE_OP:.*]] = "ttnn.to_device"(%[[CASTING_OP]], %[[GET_DEVICE_OP]])
+    // CHECK-NEXT: %[[TO_LAYOUT_OP:.*]] = "ttnn.to_layout"(%[[TO_DEVICE_OP]], %[[GET_DEVICE_OP]])
+    // CHECK-SAME: layout = #ttnn.layout
+    // CHECK-NEXT: return %[[TO_LAYOUT_OP]]
+    %0 = "ttnn.get_device"() <{mesh_shape = #ttnn}> : () -> !tt.device<#device>
+    %1 = "ttnn.to_layout"(%arg0, %0) <{dtype = #tt.supportedDataTypes, layout = #ttnn.layout, memory_config = #ttnn.memory_config<#dram, <<2x4>>, >}> : (tensor<64x128xf32, #ttnn_layout_host_rm>, !tt.device<#device>) -> tensor<64x128xbf16, #ttnn_layout_device_tile_bf16>
+    return %1 : tensor<64x128xbf16, #ttnn_layout_device_tile_bf16>
+  }
+
+  // Test case when we move tensor from host to device for row-major -> tile case and we don't cast data type to bf16 nor from bf16.
+  func.func @from_host_to_device_data_type_from_f32_to_u32_from_rm_to_tile(%arg0: tensor<64x128xf32, #ttnn_layout_host_rm>) -> tensor<64x128xi32, #ttnn_layout_device_tile_u32> {
+    // This test verifies that the `to_dtype`, `to_layout` and `to_device` operations are correctly inserted to cast the data type from f32 to u32 and tilize on host and then move tensor to device.
+ // CHECK: %[[GET_DEVICE_OP:.*]] = "ttnn.get_device"() + // CHECK-NEXT: %[[CASTING_OP:.*]] = "ttnn.to_dtype"(%arg0) + // CHECK-SAME: dtype = #tt.supportedDataTypes + // CHECK-NEXT: %[[TO_LAYOUT_OP:.*]] = "ttnn.to_layout"(%[[CASTING_OP]]) + // CHECK-SAME: layout = #ttnn.layout + // CHECK-NEXT: %[[TO_DEVICE_OP:.*]] = "ttnn.to_device"(%[[TO_LAYOUT_OP]], %[[GET_DEVICE_OP]]) + // CHECK-NEXT: return %[[TO_DEVICE_OP]] + %0 = "ttnn.get_device"() <{mesh_shape = #ttnn}> : () -> !tt.device<#device> + %1 = "ttnn.to_layout"(%arg0, %0) <{dtype = #tt.supportedDataTypes, layout = #ttnn.layout, memory_config = #ttnn.memory_config<#dram, <<2x4>>, >}> : (tensor<64x128xf32, #ttnn_layout_host_rm>, !tt.device<#device>) -> tensor<64x128xi32, #ttnn_layout_device_tile_u32> + return %1 : tensor<64x128xi32, #ttnn_layout_device_tile_u32> + } +} diff --git a/test/ttmlir/Silicon/TTNN/perf_unit/test_perf_clamp.mlir b/test/ttmlir/Silicon/TTNN/perf_unit/test_perf_clamp.mlir index 44806c22df..5f5239c40b 100644 --- a/test/ttmlir/Silicon/TTNN/perf_unit/test_perf_clamp.mlir +++ b/test/ttmlir/Silicon/TTNN/perf_unit/test_perf_clamp.mlir @@ -4,8 +4,9 @@ func.func @clamp(%arg0: tensor<64x128xbf16>) -> tensor<64x128xbf16> { %0 = tensor.empty() : tensor<64x128xbf16> + // CHECK: %[[GET_DEVICE:.*]] = "ttnn.get_device"() // CHECK: %[[DEVICE:.*]] = "ttnn.to_device"(%arg0, - // CHECK: %[[LAYOUT:.*]] = "ttnn.to_layout"(%[[DEVICE]]) + // CHECK: %[[LAYOUT:.*]] = "ttnn.to_layout"(%[[DEVICE]], %[[GET_DEVICE]]) // CHECK: = "ttnn.clamp"(%[[LAYOUT]]) // CHECK-SAME: {max = 3.000000e+00 : f32, min = 2.000000e+00 : f32} // CHECK-SAME: [[TENSOR:tensor<64x128xbf16]], #ttnn_layout{{[0-9]+}}>) -> [[TENSOR]] diff --git a/test/ttmlir/Silicon/TTNN/simple_eltwise.mlir b/test/ttmlir/Silicon/TTNN/simple_eltwise.mlir index a0452f01f8..52c99bae48 100644 --- a/test/ttmlir/Silicon/TTNN/simple_eltwise.mlir +++ b/test/ttmlir/Silicon/TTNN/simple_eltwise.mlir @@ -19,8 +19,9 @@ func.func @ceil(%arg0: tensor<32x32xf32>) -> tensor<32x32xf32> { func.func @clamp(%arg0: tensor<64x128xbf16>) -> tensor<64x128xbf16> { %0 = tensor.empty() : tensor<64x128xbf16> + // CHECK: %[[GET_DEVICE:.*]] = "ttnn.get_device"() // CHECK: %[[DEVICE:.*]] = "ttnn.to_device"(%arg0, - // CHECK: %[[LAYOUT:.*]] = "ttnn.to_layout"(%[[DEVICE]]) + // CHECK: %[[LAYOUT:.*]] = "ttnn.to_layout"(%[[DEVICE]], %[[GET_DEVICE]]) // CHECK: = "ttnn.clamp"(%[[LAYOUT]]) // CHECK-SAME: {max = 3.000000e+00 : f32, min = 2.000000e+00 : f32} // CHECK-SAME: [[TENSOR:tensor<64x128xbf16]], #ttnn_layout{{[0-9]+}}>) -> [[TENSOR]] diff --git a/tools/scripts/extract-and-replace-system-desc-and-device.py b/tools/scripts/extract-and-replace-system-desc-and-device.py new file mode 100644 index 0000000000..cddbd2f549 --- /dev/null +++ b/tools/scripts/extract-and-replace-system-desc-and-device.py @@ -0,0 +1,46 @@ +# SPDX-FileCopyrightText: (c) 2025 Tenstorrent AI ULC +# +# SPDX-License-Identifier: Apache-2.0 +import sys +import os + + +def main(input_file, output_file): + # Read content from the input file + with open(input_file, "r") as file: + system_desc = "" + device_desc = "" + for line in file: + if "#system_desc =" in line: + system_desc = line.strip() + if "#device =" in line: + device_desc = line.strip() + + # Write the modified content to the output file + modified_content = "" + with open(output_file, "r") as file: + for line in file: + # print(line) + if line.strip().startswith("#device ="): + modified_content += device_desc + elif line.strip().startswith("#system_desc ="): + modified_content += system_desc + elif 
line.strip().startswith("// RUN:"):
+                continue
+            else:
+                modified_content += line
+
+    print(modified_content)
+
+
+if __name__ == "__main__":
+    if len(sys.argv) != 3:
+        print(
+            "Usage: python extract-and-replace-system-desc-and-device.py <input_file> <output_file>"
+        )
+        sys.exit(1)
+
+    input_file = sys.argv[1]
+    output_file = sys.argv[2]
+
+    main(input_file, output_file)
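
The helper script above takes the freshly generated MLIR as its first argument, extracts the real #system_desc and #device aliases from it, substitutes them into the checked-in test passed as the second argument, and prints the result to stdout. A minimal standalone driver is sketched below for illustration only; the file paths are hypothetical, and the real tests rely on the lit substitutions (%t.mlir, %s, %ttmlir_scripts_root) shown in the RUN lines above.

    # Hypothetical driver mirroring the lit RUN pipeline; paths are examples, not part of this patch.
    import subprocess
    import sys

    # Equivalent of %t.mlir: output of `ttmlir-opt --ttir-load-system-desc=... --ttir-implicit-device`.
    generated_mlir = "build/test/create_system_desc_device.out.mlir"
    # Equivalent of %s: the checked-in test with placeholder #device/#system_desc aliases.
    test_mlir = "test/ttmlir/Silicon/TTNN/Transforms/DecomposeLayouts/decomposing_layouts_from_host.mlir"

    # The script writes the substituted test to stdout, so capture it and save it,
    # mimicking the `> %t_replaced.mlir` redirection in the RUN line.
    result = subprocess.run(
        [
            sys.executable,
            "tools/scripts/extract-and-replace-system-desc-and-device.py",
            generated_mlir,
            test_mlir,
        ],
        capture_output=True,
        text=True,
        check=True,
    )

    with open("build/test/decomposing_layouts_from_host.replaced.mlir", "w") as f:
        f.write(result.stdout)

The replaced file is then what `ttmlir-opt --ttnn-decompose-layouts` and FileCheck consume in the remaining RUN lines.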