diff --git a/.github/workflows/build-and-test.yml b/.github/workflows/build-and-test.yml
index ade377c06a..f5e8065ea8 100644
--- a/.github/workflows/build-and-test.yml
+++ b/.github/workflows/build-and-test.yml
@@ -437,6 +437,12 @@ jobs:
         source env/activate
         pytest -ssv runtime/tools/python/test/test_perf.py
 
+    - name: ttrt api tests
+      shell: bash
+      run: |
+        source env/activate
+        pytest -ssv runtime/tools/python/test/test_api.py
+
   build-and-test-explorer:
     needs: build-image
     timeout-minutes: 60
diff --git a/runtime/include/tt/runtime/detail/ttmetal.h b/runtime/include/tt/runtime/detail/ttmetal.h
index 7a68a7e944..ac04d87a9d 100644
--- a/runtime/include/tt/runtime/detail/ttmetal.h
+++ b/runtime/include/tt/runtime/detail/ttmetal.h
@@ -39,8 +39,8 @@ void closeDevice(Device device);
 
 void deallocateBuffers(Device device);
 
-Event submit(Device device, Binary executable, std::uint32_t programIndex,
-             std::vector<Tensor> const &inputs,
+Event submit(Device deviceHandle, Binary executableHandle,
+             std::uint32_t programIndex, std::vector<Tensor> const &inputs,
              std::vector<Tensor> const &outputs);
 
 void wait(Event event);
diff --git a/runtime/include/tt/runtime/detail/ttnn.h b/runtime/include/tt/runtime/detail/ttnn.h
index 6c55ac1de7..d76ba67f4b 100644
--- a/runtime/include/tt/runtime/detail/ttnn.h
+++ b/runtime/include/tt/runtime/detail/ttnn.h
@@ -75,11 +75,16 @@ void closeDevice(Device device);
 
 void deallocateBuffers(Device device);
 
-Event submit(Device device, Binary executable, std::uint32_t programIndex,
-             std::vector<Tensor> const &inputs,
-             std::vector<Tensor> const &outputs);
+Tensor toHost(Tensor tensor, bool untilize = false);
 
-void wait(Event event);
+Tensor toDevice(Tensor tensor, Device device);
+
+Tensor toDevice(Tensor tensor, Device device, Layout layout);
+
+Tensor toLayout(Tensor tensor, Layout layout);
+
+Layout getLayout(Binary executableHandle, std::uint32_t programIndex,
+                 std::uint32_t inputIndex);
 
 std::string getOpDebugString(OpContext opContextHandle);
 
@@ -88,10 +93,34 @@ Tensor getOpOutputTensor(OpContext opContextHandle,
 
 std::vector<float> getTensorData(Tensor tensor);
 
+void memcpy(void *dst, Tensor src);
+
+void memcpy(Tensor dst, Tensor src);
+
+void deallocateTensor(Tensor &tensor, bool force = false);
+
+namespace legacy {
+/* Will be deprecated soon once FEs migrate to new API */
+void wait(Event event);
+
+Event submit(Device deviceHandle, Binary executableHandle,
+             std::uint32_t programIndex, std::vector<Tensor> const &inputs,
+             std::vector<Tensor> const &outputs);
+
 void runProgram(::ttnn::MeshDevice &meshDevice, Binary &executableHandle,
                 std::uint32_t programIndex,
                 std::vector<::ttnn::Tensor *> const &inputs,
                 std::vector<::ttnn::Tensor *> const &outputs);
+} // namespace legacy
+
+std::vector<Tensor> submit(Device deviceHandle, Binary executableHandle,
+                           std::uint32_t programIndex,
+                           std::vector<Tensor> const &inputs);
+
+std::vector<Tensor> runProgram(::ttnn::MeshDevice &meshDevice,
+                               Binary executableHandle,
+                               std::uint32_t programIndex,
+                               std::vector<::ttnn::Tensor *> const &inputs);
 
 } // namespace tt::runtime::ttnn
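Worth calling out for frontend integrators: the TTNN backend now exposes two submit flavors, and the output-preallocating one has moved under `tt::runtime::ttnn::legacy`. A minimal sketch of the two call shapes at the top-level API (device, binary, and tensor setup elided; the function and variable names here are illustrative, not part of this patch):

```cpp
// Sketch only: contrasts the two submit flavors declared above. Assumes
// `device`, `binary`, `inputs`, and `outputs` were created elsewhere
// through the runtime API.
#include "tt/runtime/runtime.h"

void runBothWays(tt::runtime::Device device, tt::runtime::Binary binary,
                 std::vector<tt::runtime::Tensor> &inputs,
                 std::vector<tt::runtime::Tensor> &outputs) {
  // Legacy flavor: caller preallocates outputs and waits on an Event.
  tt::runtime::Event event =
      tt::runtime::submit(device, binary, /*programIndex=*/0, inputs, outputs);
  tt::runtime::wait(event);

  // New flavor: the runtime allocates and returns the outputs itself.
  std::vector<tt::runtime::Tensor> results =
      tt::runtime::submit(device, binary, /*programIndex=*/0, inputs);
}
```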
diff --git a/runtime/include/tt/runtime/runtime.h b/runtime/include/tt/runtime/runtime.h
index 1dc721f662..7a8a92aec1 100644
--- a/runtime/include/tt/runtime/runtime.h
+++ b/runtime/include/tt/runtime/runtime.h
@@ -63,12 +63,19 @@ Device openDevice(DeviceIds const &deviceIds, size_t numHWCQs = 1);
 
 void closeDevice(Device device);
 
-Event submit(Device device, Binary executable, std::uint32_t programIndex,
-             std::vector<Tensor> const &inputs,
-             std::vector<Tensor> const &outputs);
-
 void wait(Event event);
 
+Tensor toHost(Tensor tensor, bool untilize = false);
+
+Tensor toDevice(Tensor tensor, Device device);
+
+Tensor toDevice(Tensor tensor, Device device, Layout layout);
+
+Tensor toLayout(Tensor tensor, Layout layout);
+
+Layout getLayout(Binary executableHandle, std::uint32_t programIndex,
+                 std::uint32_t inputIndex);
+
 std::string getOpDebugString(OpContext opContextHandle);
 
 Tensor getOpOutputTensor(OpContext opContextHandle,
@@ -76,6 +83,20 @@ Tensor getOpOutputTensor(OpContext opContextHandle,
 
 std::vector<float> getTensorData(Tensor tensor);
 
+void memcpy(void *dst, Tensor src);
+
+void memcpy(Tensor dst, Tensor src);
+
+void deallocateTensor(Tensor &tensor, bool force = false);
+
+std::vector<Tensor> submit(Device deviceHandle, Binary executableHandle,
+                           std::uint32_t programIndex,
+                           std::vector<Tensor> const &inputs);
+
+Event submit(Device deviceHandle, Binary executableHandle,
+             std::uint32_t programIndex, std::vector<Tensor> const &inputs,
+             std::vector<Tensor> const &outputs);
+
 } // namespace tt::runtime
 
 #endif
diff --git a/runtime/include/tt/runtime/types.h b/runtime/include/tt/runtime/types.h
index 8fd641195f..cc2791e237 100644
--- a/runtime/include/tt/runtime/types.h
+++ b/runtime/include/tt/runtime/types.h
@@ -122,10 +122,20 @@ struct Event : public detail::RuntimeCheckedObjectImpl {
 
 struct Tensor : public detail::RuntimeCheckedObjectImpl {
   std::shared_ptr<void> data;
-
+  Event event;
   Tensor(std::shared_ptr<void> handle, std::shared_ptr<void> data,
          DeviceRuntime runtime)
-      : detail::RuntimeCheckedObjectImpl(handle, runtime), data(data) {}
+      : detail::RuntimeCheckedObjectImpl(handle, runtime), data(data),
+        event(nullptr, runtime) {}
+
+  Tensor(std::shared_ptr<void> handle, std::shared_ptr<void> data,
+         std::shared_ptr<void> eventHandle, DeviceRuntime runtime)
+      : detail::RuntimeCheckedObjectImpl(handle, runtime), data(data),
+        event(eventHandle, runtime) {}
+};
+
+struct Layout : public detail::RuntimeCheckedObjectImpl {
+  using detail::RuntimeCheckedObjectImpl::RuntimeCheckedObjectImpl;
 };
 
 struct CallbackContext : public detail::RuntimeCheckedObjectImpl {
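Putting the new `runtime.h` surface together, a hedged end-to-end sketch (assumes a TTNN flatbuffer and a host tensor created elsewhere; `runProgramZero` and its arguments are illustrative, not part of this patch):

```cpp
// Sketch of a full round trip through the new API. The host input is assumed
// to have been built with createTensor(); shapes and sizes are placeholders.
#include "tt/runtime/runtime.h"

std::vector<float> runProgramZero(tt::runtime::Device device,
                                  tt::runtime::Binary binary,
                                  tt::runtime::Tensor hostInput,
                                  size_t numOutputElements) {
  // Query the layout the program expects for input 0, then move the host
  // tensor onto the device in that layout.
  tt::runtime::Layout layout = tt::runtime::getLayout(binary, 0, 0);
  tt::runtime::Tensor deviceInput =
      tt::runtime::toDevice(hostInput, device, layout);

  std::vector<tt::runtime::Tensor> outputs =
      tt::runtime::submit(device, binary, 0, {deviceInput});

  // Untilize on the way back so the host sees row-major data, then copy out.
  tt::runtime::Tensor hostOutput =
      tt::runtime::toHost(outputs.at(0), /*untilize=*/true);
  std::vector<float> result(numOutputElements);
  tt::runtime::memcpy(result.data(), hostOutput);

  tt::runtime::deallocateTensor(deviceInput, /*force=*/true);
  return result;
}
```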
diff --git a/runtime/lib/binary.cpp b/runtime/lib/binary.cpp
index 92be39d27f..b81bf1a770 100644
--- a/runtime/lib/binary.cpp
+++ b/runtime/lib/binary.cpp
@@ -27,15 +27,12 @@ static std::string asJson(void const *fbb, uint8_t const *binarySchema,
   flatbuffers::Parser parser(opts);
 
   if (not parser.Deserialize(binarySchema, schemaSize)) {
-    throw std::runtime_error("Failed to deserialize schema");
+    LOG_FATAL("Failed to deserialize schema");
   }
 
   std::string text;
   const char *err = ::flatbuffers::GenerateText(parser, fbb, &text);
-  if (err) {
-    throw std::runtime_error("Failed to generate JSON: " + std::string(err));
-  }
-
+  LOG_ASSERT(not err, "Failed to generate JSON: ", err);
   return text;
 }
 
@@ -44,9 +41,7 @@ namespace ttnn {
 ::tt::target::ttnn::TTNNBinary const *getBinary(Flatbuffer binary) {
   bool isTTNN = ::tt::target::ttnn::SizePrefixedTTNNBinaryBufferHasIdentifier(
       binary.handle.get());
-  if (not isTTNN) {
-    throw std::runtime_error("Unsupported binary format");
-  }
+  LOG_ASSERT(isTTNN, "Unsupported binary format");
   return ::tt::target::ttnn::GetSizePrefixedTTNNBinary(binary.handle.get());
 }
 
@@ -128,9 +123,7 @@ ::tt::target::metal::TTMetalBinary const *getBinary(Flatbuffer binary) {
   bool isTTMetal =
       ::tt::target::metal::SizePrefixedTTMetalBinaryBufferHasIdentifier(
           binary.handle.get());
-  if (not isTTMetal) {
-    throw std::runtime_error("Unsupported binary format");
-  }
+  LOG_ASSERT(isTTMetal, "Unsupported binary format");
   return ::tt::target::metal::GetSizePrefixedTTMetalBinary(binary.handle.get());
 }
 
@@ -207,7 +200,7 @@ namespace system_desc {
 ::tt::target::SystemDescRoot const *getBinary(Flatbuffer binary) {
   if (!::tt::target::SizePrefixedSystemDescRootBufferHasIdentifier(
           binary.handle.get())) {
-    throw std::runtime_error("Unsupported binary format");
+    LOG_FATAL("Unsupported binary format");
   }
   return ::tt::target::GetSizePrefixedSystemDescRoot(binary.handle.get());
 }
@@ -234,10 +227,7 @@ std::string asJson(Flatbuffer binary) {
 Flatbuffer Flatbuffer::loadFromPath(char const *path) {
   // load a flatbuffer from path
   std::ifstream fbb(path, std::ios::binary | std::ios::ate);
-  if (!fbb.is_open()) {
-    throw std::runtime_error("Failed to open file: " + std::string(path));
-  }
-
+  LOG_ASSERT(fbb.is_open(), "Failed to open file: ", path);
   std::streampos size = fbb.tellg();
   fbb.seekg(0, std::ios::beg);
   auto buffer = ::tt::runtime::utils::malloc_shared(size);
@@ -269,7 +259,7 @@ std::string_view Flatbuffer::getFileIdentifier() const {
     return ::tt::target::SystemDescRootIdentifier();
   }
 
-  throw std::runtime_error("Unsupported binary format");
+  LOG_FATAL("Unsupported binary format");
 }
 
 std::string Flatbuffer::getVersion() const {
@@ -288,7 +278,7 @@ std::string Flatbuffer::getVersion() const {
     return system_desc::getVersion(*this);
   }
 
-  throw std::runtime_error("Unsupported binary format");
+  LOG_FATAL("Unsupported binary format");
 }
 
 std::string_view Flatbuffer::getTTMLIRGitHash() const {
@@ -307,7 +297,7 @@ std::string_view Flatbuffer::getTTMLIRGitHash() const {
     return system_desc::getTTMLIRGitHash(*this);
   }
 
-  throw std::runtime_error("Unsupported binary format");
+  LOG_FATAL("Unsupported binary format");
 }
 
 std::string Flatbuffer::asJson() const {
@@ -326,7 +316,7 @@ std::string Flatbuffer::asJson() const {
     return system_desc::asJson(*this);
   }
 
-  throw std::runtime_error("Unsupported binary format");
+  LOG_FATAL("Unsupported binary format");
 }
 
 SystemDesc SystemDesc::loadFromPath(char const *path) {
@@ -349,7 +339,7 @@ Binary::getProgramInputs(std::uint32_t programIndex) const {
     return metal::getProgramInputs(*this, programIndex);
   }
 
-  throw std::runtime_error("Unsupported binary format");
+  LOG_FATAL("Unsupported binary format");
 }
 
 std::vector<TensorDesc>
@@ -364,7 +354,7 @@ Binary::getProgramOutputs(std::uint32_t programIndex) const {
     return metal::getProgramOutputs(*this, programIndex);
   }
 
-  throw std::runtime_error("Unsupported binary format");
+  LOG_FATAL("Unsupported binary format");
 }
 
 const ::tt::target::GoldenTensor *
diff --git a/runtime/lib/common/system_desc.cpp b/runtime/lib/common/system_desc.cpp
index f1210d00aa..3b4685901a 100644
--- a/runtime/lib/common/system_desc.cpp
+++ b/runtime/lib/common/system_desc.cpp
@@ -32,7 +32,7 @@ static ::tt::target::Arch toFlatbuffer(::tt::ARCH arch) {
     break;
   }
 
-  throw std::runtime_error("Unsupported arch");
+  LOG_FATAL("Unsupported arch");
 }
 
 static std::vector<::tt::target::ChipChannel>
@@ -246,7 +246,7 @@ static std::unique_ptr<::tt::runtime::SystemDesc> getCurrentSystemDescImpl(
   ::tt::target::FinishSizePrefixedSystemDescRootBuffer(fbb, root);
   ::flatbuffers::Verifier verifier(fbb.GetBufferPointer(), fbb.GetSize());
   if (!::tt::target::VerifySizePrefixedSystemDescRootBuffer(verifier)) {
-    throw std::runtime_error("Failed to verify system desc root buffer");
+    LOG_FATAL("Failed to verify system desc root buffer");
   }
   uint8_t *buf = fbb.GetBufferPointer();
   auto size = fbb.GetSize();
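These conversions lean on `LOG_FATAL`/`LOG_ASSERT` from `tt/runtime/detail/logger.h`. The following stand-ins are purely illustrative (the real macros differ, e.g. they take variadic message arguments), but they show the one contract the call sites above depend on: `LOG_FATAL` never returns, so functions like `Flatbuffer::getFileIdentifier()` stay well-formed without a trailing `return`:

```cpp
// Illustrative only; not the real logger implementation.
#include <cstdio>
#include <cstdlib>

[[noreturn]] inline void logFatalImpl(const char *msg) {
  std::fprintf(stderr, "FATAL: %s\n", msg);
  std::abort(); // never returns, which is what makes LOG_FATAL usable
                // as the last statement of a non-void function
}

#define LOG_FATAL(msg) logFatalImpl(msg)
#define LOG_ASSERT(cond, ...)                                                  \
  do {                                                                         \
    if (!(cond)) {                                                             \
      logFatalImpl(#cond);                                                     \
    }                                                                          \
  } while (0)
```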
diff --git a/runtime/lib/runtime.cpp b/runtime/lib/runtime.cpp
index 586b8394ea..591133acee 100644
--- a/runtime/lib/runtime.cpp
+++ b/runtime/lib/runtime.cpp
@@ -42,7 +42,7 @@ void deallocateBuffers(Device device) {
     return ::tt::runtime::ttmetal::deallocateBuffers(device);
   }
 #endif
-  throw std::runtime_error("runtime is not enabled");
+  LOG_FATAL("runtime is not enabled");
 }
 
 } // namespace detail
@@ -91,15 +91,14 @@ void setCompatibleRuntime(const Binary &binary) {
     return setCurrentRuntime(DeviceRuntime::TTMetal);
   }
 #endif
-  throw std::runtime_error(
-      "Unsupported binary file identifier or runtime not enabled");
+  LOG_FATAL("Unsupported binary file identifier or runtime not enabled");
 }
 
 std::pair<SystemDesc, DeviceIds> getCurrentSystemDesc() {
 #if defined(TT_RUNTIME_ENABLE_TTNN) || defined(TT_RUNTIME_ENABLE_TTMETAL)
   return system_desc::getCurrentSystemDesc();
 #endif
-  throw std::runtime_error("runtime is not enabled");
+  LOG_FATAL("runtime is not enabled");
 }
 
 Tensor createTensor(std::shared_ptr<void> data,
@@ -122,7 +121,7 @@ Tensor createTensor(std::shared_ptr<void> data,
                                                 dataType);
   }
 #endif
-  throw std::runtime_error("runtime is not enabled");
+  LOG_FATAL("runtime is not enabled");
 }
 
 Tensor
@@ -143,10 +142,10 @@ createTensor(std::vector<std::shared_ptr<void>> &data,
 
 #if defined(TT_RUNTIME_ENABLE_TTMETAL)
   if (getCurrentRuntime() == DeviceRuntime::TTMetal) {
-    throw std::runtime_error("Not implemented");
+    LOG_FATAL("Not implemented");
   }
 #endif
-  throw std::runtime_error("runtime is not enabled");
+  LOG_FATAL("runtime is not enabled");
 }
 
 tt::target::DataType getTensorDataType(Tensor tensor) {
@@ -161,7 +160,7 @@ tt::target::DataType getTensorDataType(Tensor tensor) {
     return ::tt::runtime::ttmetal::getTensorDataType(tensor);
   }
 #endif
-  throw std::runtime_error("runtime is not enabled");
+  LOG_FATAL("runtime is not enabled");
 }
 
 size_t getNumAvailableDevices() {
@@ -176,7 +175,7 @@ size_t getNumAvailableDevices() {
     return ::tt::runtime::ttmetal::getNumAvailableDevices();
   }
 #endif
-  throw std::runtime_error("runtime is not enabled");
+  LOG_FATAL("runtime is not enabled");
 }
 
 Device openDevice(DeviceIds const &deviceIds, size_t numHWCQs) {
@@ -191,7 +190,7 @@ Device openDevice(DeviceIds const &deviceIds, size_t numHWCQs) {
     return ::tt::runtime::ttmetal::openDevice(deviceIds, numHWCQs);
  }
 #endif
-  throw std::runtime_error("runtime is not enabled");
+  LOG_FATAL("runtime is not enabled");
 }
 
 void closeDevice(Device device) {
@@ -206,44 +205,100 @@ void closeDevice(Device device) {
     return ::tt::runtime::ttmetal::closeDevice(device);
   }
 #endif
-  throw std::runtime_error("runtime is not enabled");
+  LOG_FATAL("runtime is not enabled");
 }
 
-Event submit(Device deviceHandle, Binary executableHandle,
-             std::uint32_t programIndex,
-             std::vector<Tensor> const &inputHandles,
-             std::vector<Tensor> const &outputHandles) {
+void wait(Event event) {
 #if defined(TT_RUNTIME_ENABLE_TTNN)
   if (getCurrentRuntime() == DeviceRuntime::TTNN) {
-    return ::tt::runtime::ttnn::submit(deviceHandle, executableHandle,
-                                       programIndex, inputHandles,
-                                       outputHandles);
+    LOG_WARNING("wait API will be deprecated for TTNN runtime.");
+    return ::tt::runtime::ttnn::legacy::wait(event);
   }
 #endif
 
 #if defined(TT_RUNTIME_ENABLE_TTMETAL)
   if (getCurrentRuntime() == DeviceRuntime::TTMetal) {
-    return ::tt::runtime::ttmetal::submit(deviceHandle, executableHandle,
-                                          programIndex, inputHandles,
-                                          outputHandles);
+    return ::tt::runtime::ttmetal::wait(event);
   }
 #endif
-  throw std::runtime_error("runtime is not enabled");
+  LOG_FATAL("runtime is not enabled");
 }
 
-void wait(Event event) {
+Tensor toHost(Tensor tensor, bool untilize) {
 #if defined(TT_RUNTIME_ENABLE_TTNN)
   if (getCurrentRuntime() == DeviceRuntime::TTNN) {
-    return ::tt::runtime::ttnn::wait(event);
+    return ::tt::runtime::ttnn::toHost(tensor, untilize);
   }
 #endif
 
 #if defined(TT_RUNTIME_ENABLE_TTMETAL)
   if (getCurrentRuntime() == DeviceRuntime::TTMetal) {
-    return ::tt::runtime::ttmetal::wait(event);
+    LOG_FATAL("not implemented");
+  }
+#endif
+  LOG_FATAL("runtime is not enabled");
+}
+
+Tensor toDevice(Tensor tensor, Device device) {
+#if defined(TT_RUNTIME_ENABLE_TTNN)
+  if (getCurrentRuntime() == DeviceRuntime::TTNN) {
+    return ::tt::runtime::ttnn::toDevice(tensor, device);
+  }
+#endif
+
+#if defined(TT_RUNTIME_ENABLE_TTMETAL)
+  if (getCurrentRuntime() == DeviceRuntime::TTMetal) {
+    LOG_FATAL("not implemented");
   }
 #endif
-  throw std::runtime_error("runtime is not enabled");
+  LOG_FATAL("runtime is not enabled");
+}
+
+Tensor toDevice(Tensor tensor, Device device, Layout layout) {
+#if defined(TT_RUNTIME_ENABLE_TTNN)
+  if (getCurrentRuntime() == DeviceRuntime::TTNN) {
+    return ::tt::runtime::ttnn::toDevice(tensor, device, layout);
+  }
+#endif
+
+#if defined(TT_RUNTIME_ENABLE_TTMETAL)
+  if (getCurrentRuntime() == DeviceRuntime::TTMetal) {
+    LOG_FATAL("not implemented");
+  }
+#endif
+  LOG_FATAL("runtime is not enabled");
+}
+
+Tensor toLayout(Tensor tensor, Layout layout) {
+#if defined(TT_RUNTIME_ENABLE_TTNN)
+  if (getCurrentRuntime() == DeviceRuntime::TTNN) {
+    return ::tt::runtime::ttnn::toLayout(tensor, layout);
+  }
+#endif
+
+#if defined(TT_RUNTIME_ENABLE_TTMETAL)
+  if (getCurrentRuntime() == DeviceRuntime::TTMetal) {
+    LOG_FATAL("not implemented");
+  }
+#endif
+  LOG_FATAL("runtime is not enabled");
+}
+
+Layout getLayout(Binary executableHandle, std::uint32_t programIndex,
+                 std::uint32_t inputIndex) {
+#if defined(TT_RUNTIME_ENABLE_TTNN)
+  if (getCurrentRuntime() == DeviceRuntime::TTNN) {
+    return ::tt::runtime::ttnn::getLayout(executableHandle, programIndex,
+                                          inputIndex);
+  }
+#endif
+
+#if defined(TT_RUNTIME_ENABLE_TTMETAL)
+  if (getCurrentRuntime() == DeviceRuntime::TTMetal) {
+    LOG_FATAL("not implemented");
+  }
+#endif
+  LOG_FATAL("runtime is not enabled");
 }
 
 std::string getOpDebugString(OpContext opContextHandle) {
@@ -258,7 +313,7 @@ std::string getOpDebugString(OpContext opContextHandle) {
     return ::tt::runtime::ttmetal::getOpDebugString(opContextHandle);
   }
 #endif
-  throw std::runtime_error("runtime is not enabled");
+  LOG_FATAL("runtime is not enabled");
 }
 
 Tensor getOpOutputTensor(OpContext opContextHandle,
@@ -276,7 +331,7 @@ Tensor getOpOutputTensor(OpContext opContextHandle,
                                                    programContextHandle);
   }
 #endif
-  throw std::runtime_error("runtime is not enabled");
+  LOG_FATAL("runtime is not enabled");
 }
 
 std::vector<float> getTensorData(Tensor tensor) {
@@ -292,7 +347,93 @@ std::vector<float> getTensorData(Tensor tensor) {
   }
 #endif
 
-  throw std::runtime_error("runtime is not enabled");
+  LOG_FATAL("runtime is not enabled");
+}
+
+void memcpy(void *dst, Tensor src) {
+#if defined(TT_RUNTIME_ENABLE_TTNN)
+  if (getCurrentRuntime() == DeviceRuntime::TTNN) {
+    return ::tt::runtime::ttnn::memcpy(dst, src);
+  }
+#endif
+
+#if defined(TT_RUNTIME_ENABLE_TTMETAL)
+  if (getCurrentRuntime() == DeviceRuntime::TTMetal) {
+    LOG_FATAL("not implemented");
+  }
+#endif
+  LOG_FATAL("runtime is not enabled");
+}
+
+void memcpy(Tensor dst, Tensor src) {
+#if defined(TT_RUNTIME_ENABLE_TTNN)
+  if (getCurrentRuntime() == DeviceRuntime::TTNN) {
+    return ::tt::runtime::ttnn::memcpy(dst, src);
+  }
+#endif
+
+#if defined(TT_RUNTIME_ENABLE_TTMETAL)
+  if (getCurrentRuntime() == DeviceRuntime::TTMetal) {
+    LOG_FATAL("not implemented");
+  }
+#endif
+  LOG_FATAL("runtime is not enabled");
+}
+
+void deallocateTensor(Tensor &tensor, bool force) {
+#if defined(TT_RUNTIME_ENABLE_TTNN)
+  if (getCurrentRuntime() == DeviceRuntime::TTNN) {
+    return ::tt::runtime::ttnn::deallocateTensor(tensor, force);
+  }
+#endif
+
+#if defined(TT_RUNTIME_ENABLE_TTMETAL)
+  if (getCurrentRuntime() == DeviceRuntime::TTMetal) {
+    LOG_FATAL("not implemented");
+  }
+#endif
+  LOG_FATAL("runtime is not enabled");
 }
 
+std::vector<Tensor> submit(Device deviceHandle, Binary executableHandle,
+                           std::uint32_t programIndex,
+                           std::vector<Tensor> const &inputHandles) {
+#if defined(TT_RUNTIME_ENABLE_TTNN)
+  if (getCurrentRuntime() == DeviceRuntime::TTNN) {
+    return ::tt::runtime::ttnn::submit(deviceHandle, executableHandle,
+                                       programIndex, inputHandles);
+  }
+#endif
+
+#if defined(TT_RUNTIME_ENABLE_TTMETAL)
+  if (getCurrentRuntime() == DeviceRuntime::TTMetal) {
+    LOG_FATAL("not implemented");
+  }
+#endif
+  LOG_FATAL("runtime is not enabled");
+}
+
+Event submit(Device deviceHandle, Binary executableHandle,
+             std::uint32_t programIndex,
+             std::vector<Tensor> const &inputHandles,
+             std::vector<Tensor> const &outputHandles) {
+#if defined(TT_RUNTIME_ENABLE_TTNN)
+  if (getCurrentRuntime() == DeviceRuntime::TTNN) {
+    LOG_WARNING("This submit API will soon be deprecated. Please switch to the "
+                "new API.");
+    return ::tt::runtime::ttnn::legacy::submit(deviceHandle, executableHandle,
+                                               programIndex, inputHandles,
+                                               outputHandles);
+  }
+#endif
+
+#if defined(TT_RUNTIME_ENABLE_TTMETAL)
+  if (getCurrentRuntime() == DeviceRuntime::TTMetal) {
+    return ::tt::runtime::ttmetal::submit(deviceHandle, executableHandle,
+                                          programIndex, inputHandles,
+                                          outputHandles);
+  }
+#endif
+  LOG_FATAL("runtime is not enabled");
+}
 } // namespace tt::runtime
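Every entry point in `runtime.cpp` now follows the same shape. A hypothetical stub showing the pattern for adding the next API (`newApi` is not part of this patch; the snippet assumes it sits inside `namespace tt::runtime` next to the functions above):

```cpp
// Pattern sketch: compile-time guards select which backends were built in,
// getCurrentRuntime() dispatches at runtime, and the [[noreturn]] LOG_FATAL
// fall-through keeps the function well-formed for non-void return types.
Tensor newApi(Tensor tensor) {
#if defined(TT_RUNTIME_ENABLE_TTNN)
  if (getCurrentRuntime() == DeviceRuntime::TTNN) {
    return ::tt::runtime::ttnn::newApi(tensor); // hypothetical backend hook
  }
#endif
#if defined(TT_RUNTIME_ENABLE_TTMETAL)
  if (getCurrentRuntime() == DeviceRuntime::TTMetal) {
    LOG_FATAL("not implemented");
  }
#endif
  LOG_FATAL("runtime is not enabled");
}
```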
- throw std::runtime_error("Unsupported EnqueueReadBufferCommand"); + LOG_FATAL("Unsupported EnqueueReadBufferCommand"); } void CQExecutor::execute( diff --git a/runtime/lib/ttmetal/runtime.cpp b/runtime/lib/ttmetal/runtime.cpp index ab343554ed..a695e28075 100644 --- a/runtime/lib/ttmetal/runtime.cpp +++ b/runtime/lib/ttmetal/runtime.cpp @@ -24,7 +24,7 @@ static ::tt::target::metal::TTMetalBinary const *getBinary(Flatbuffer binary) { ::tt::target::metal::SizePrefixedTTMetalBinaryBufferHasIdentifier( binary.handle.get()); if (not isTTMetal) { - throw std::runtime_error("Unsupported binary format"); + LOG_FATAL("Unsupported binary format"); } return ::tt::target::metal::GetSizePrefixedTTMetalBinary(binary.handle.get()); } @@ -56,7 +56,7 @@ tt::target::DataType getTensorDataType(Tensor tensor) { } if (std::holds_alternative>( metalTensor)) { - throw std::runtime_error("Datatype mapping from buffer not supported yet."); + LOG_FATAL("Datatype mapping from buffer not supported yet."); } LOG_ASSERT(false, "Unsupported tensor type"); return ::tt::target::DataType::Float32; @@ -117,7 +117,7 @@ prepareInput(::tt::tt_metal::Device *device, MetalTensor const &metalTensor, metalTensor)) { std::shared_ptr<::tt::tt_metal::Buffer> buffer = std::get>(metalTensor); - throw std::runtime_error("Input from buffer not supported yet"); + LOG_FATAL("Input from buffer not supported yet"); } LOG_ASSERT(false, "Unsupported tensor type"); return std::make_pair(nullptr, nullptr); diff --git a/runtime/lib/ttnn/CMakeLists.txt b/runtime/lib/ttnn/CMakeLists.txt index 92581cf46f..c50701d992 100644 --- a/runtime/lib/ttnn/CMakeLists.txt +++ b/runtime/lib/ttnn/CMakeLists.txt @@ -1,3 +1,4 @@ +add_subdirectory(include) add_subdirectory(operations) add_library(TTRuntimeTTNN STATIC @@ -11,5 +12,5 @@ target_include_directories(TTRuntimeTTNN PUBLIC ${PROJECT_BINARY_DIR}/include/ttmlir/Target/Common ) target_include_directories(TTRuntimeTTNN SYSTEM PUBLIC "$") -target_link_libraries(TTRuntimeTTNN PUBLIC TTRuntimeTTNNOps) +target_link_libraries(TTRuntimeTTNN PUBLIC TTRuntimeTTNNOps TTRuntimeTTNNHelpers) add_dependencies(TTRuntimeTTNN TTRuntimeTTNNOps) diff --git a/runtime/lib/ttnn/include/CMakeLists.txt b/runtime/lib/ttnn/include/CMakeLists.txt new file mode 100644 index 0000000000..a5cbd0e08f --- /dev/null +++ b/runtime/lib/ttnn/include/CMakeLists.txt @@ -0,0 +1,16 @@ +add_library(TTRuntimeTTNNHelpers + STATIC + tt/runtime/ttnn/utils.cpp + tt/runtime/ttnn/types.cpp +) +set_property(TARGET TTRuntimeTTNNHelpers PROPERTY CXX_STANDARD 20) +target_compile_options(TTRuntimeTTNNHelpers PUBLIC -mavx -mavx2 -fsized-deallocation) +target_include_directories(TTRuntimeTTNNHelpers PUBLIC + ${PROJECT_SOURCE_DIR}/runtime/include + ${PROJECT_SOURCE_DIR}/runtime/lib/ttnn/include + ${PROJECT_SOURCE_DIR}/runtime/lib/ttnn/operations/include + ${PROJECT_BINARY_DIR}/include/ttmlir/Target/Common +) +target_include_directories(TTRuntimeTTNNHelpers SYSTEM PUBLIC "$") +add_dependencies(TTRuntimeTTNNHelpers TTNN_LIBRARY tt-metal FBS_GENERATION) +target_link_libraries(TTRuntimeTTNNHelpers PUBLIC TTNN_LIBRARY) diff --git a/runtime/lib/ttnn/include/tt/runtime/ttnn/types.cpp b/runtime/lib/ttnn/include/tt/runtime/ttnn/types.cpp new file mode 100644 index 0000000000..bac739c140 --- /dev/null +++ b/runtime/lib/ttnn/include/tt/runtime/ttnn/types.cpp @@ -0,0 +1,112 @@ +// SPDX-FileCopyrightText: (c) 2024 Tenstorrent AI ULC +// +// SPDX-License-Identifier: Apache-2.0 + +#include "tt/runtime/ttnn/types.h" +#include "tt/runtime/detail/logger.h" +#include 
"tt/runtime/ttnn/utils.h" + +namespace tt::runtime::ttnn { + +// +// ProgramTensorPool APIs +// +std::pair::iterator, bool> +ProgramTensorPool::try_emplace(std::uint32_t globalId, + const ::ttnn::Tensor &tensor) { + auto it = liveTensors.find(globalId); + if (it != liveTensors.end()) { + return std::make_pair(it, false); + } + LOG_ASSERT(!intermedTensors.contains(globalId)); + intermedTensors.try_emplace(globalId, tensor); + return liveTensors.try_emplace(globalId, &intermedTensors.at(globalId)); +} + +std::pair::iterator, bool> +ProgramTensorPool::insert_or_assign(std::uint32_t globalId, + const ::ttnn::Tensor &tensor) { + intermedTensors.insert_or_assign(globalId, tensor); + return liveTensors.insert_or_assign(globalId, &intermedTensors.at(globalId)); +} + +::ttnn::Tensor &ProgramTensorPool::at(std::uint32_t globalId) { + LOG_ASSERT(liveTensors.contains(globalId)); + return *liveTensors.at(globalId); +} + +const ::ttnn::Tensor &ProgramTensorPool::at(std::uint32_t globalId) const { + LOG_ASSERT(liveTensors.contains(globalId)); + return *liveTensors.at(globalId); +} + +size_t ProgramTensorPool::erase(std::uint32_t globalId) { + LOG_ASSERT(liveTensors.contains(globalId) && + intermedTensors.contains(globalId)); + intermedTensors.erase(globalId); + return liveTensors.erase(globalId); +} + +std::vector ProgramTensorPool::gatherOutputTensors() { + std::vector outputTensors; + outputTensors.reserve(programOutputs.size()); + std::transform( + programOutputs.begin(), programOutputs.end(), + std::back_inserter(outputTensors), [this](uint32_t outputGlobalId) { + return utils::createRuntimeTensorFromTTNN(this->at(outputGlobalId)); + }); + return outputTensors; +} + +// +// ProgramContext APIs +// +ProgramContext::ProgramContext( + const std::unordered_map &liveTensors, + const std::vector &programInputs, + const std::vector &programOutputs, ::ttnn::MeshDevice *parentMesh) + : tensorPool(ProgramTensorPool(liveTensors, programInputs, programOutputs)), + parentMesh(parentMesh) { + LOG_ASSERT(parentMesh, "Parent mesh cannot be null"); +} + +void ProgramContext::addSubMesh(uint32_t meshId, + std::shared_ptr<::ttnn::MeshDevice> subMesh) { + auto [it, inserted] = subMeshes.try_emplace(meshId, subMesh); + LOG_ASSERT(inserted, "Submesh already exists"); +} + +::ttnn::MeshDevice &ProgramContext::getSubMesh(uint32_t meshId) { + LOG_ASSERT(subMeshes.contains(meshId)); + return *subMeshes.at(meshId); +} + +size_t ProgramContext::subMeshSize(uint32_t meshId) const { + LOG_ASSERT(subMeshes.contains(meshId)); + return subMeshes.at(meshId)->num_devices(); +} + +::ttnn::Device &ProgramContext::getDeviceFromSubMesh(uint32_t meshId, + int physicalDeviceId) { + LOG_ASSERT(subMeshes.contains(meshId)); + auto &subMesh = *subMeshes.at(meshId); + return *subMesh.get_device(physicalDeviceId); +} + +::ttnn::Device &ProgramContext::getDeviceIndexFromSubMesh(uint32_t meshId, + int deviceIndex) { + LOG_ASSERT(subMeshes.contains(meshId)); + auto &subMesh = *subMeshes.at(meshId); + return *subMesh.get_device_index(deviceIndex); +} + +DeviceVariant ProgramContext::getTargetDevice(uint32_t meshId) { + LOG_ASSERT(subMeshes.contains(meshId)); + auto &subMesh = *subMeshes.at(meshId); + if (subMesh.num_devices() == 1) { + return std::ref(*subMesh.get_device_index(0)); + } + return std::ref(subMesh); +} + +} // namespace tt::runtime::ttnn diff --git a/runtime/lib/ttnn/include/tt/runtime/ttnn/types.h b/runtime/lib/ttnn/include/tt/runtime/ttnn/types.h index 5cd08c7ed0..49ae3cda33 100644 --- 
diff --git a/runtime/lib/ttnn/include/tt/runtime/ttnn/types.h b/runtime/lib/ttnn/include/tt/runtime/ttnn/types.h
index 5cd08c7ed0..49ae3cda33 100644
--- a/runtime/lib/ttnn/include/tt/runtime/ttnn/types.h
+++ b/runtime/lib/ttnn/include/tt/runtime/ttnn/types.h
@@ -6,18 +6,37 @@
 #define TT_RUNTIME_TTNN_TYPES_H
 
 #include "tt/runtime/detail/ttnn.h"
+#include <optional>
+#include <unordered_map>
 
 namespace tt::runtime::ttnn {
-
-using TensorMap = std::unordered_map<uint32_t, ::ttnn::Tensor *>;
 using DeviceVariant = std::variant<std::reference_wrapper<::ttnn::Device>,
                                    std::reference_wrapper<::ttnn::MeshDevice>>;
 
+struct LayoutDesc {
+  ::ttnn::BufferType bufferType;
+  ::ttnn::Layout layout;
+  ::ttnn::DataType dataType;
+  std::optional<::ttnn::MemoryConfig> memoryConfig;
+
+  LayoutDesc(const ::ttnn::BufferType &bufferType, const ::ttnn::Layout &layout,
+             const ::ttnn::DataType &dataType,
+             const std::optional<::ttnn::MemoryConfig> &memoryConfig)
+      : bufferType(bufferType), layout(layout), dataType(dataType),
+        memoryConfig(memoryConfig) {}
+
+  bool isOnHost() const {
+    return bufferType == ::ttnn::BufferType::SYSTEM_MEMORY;
+  }
+  bool isOnDevice() const { return !isOnHost(); }
+};
+
 class ProgramTensorPool {
 public:
-  ProgramTensorPool(const TensorMap &liveTensors,
-                    const std::unordered_set<uint32_t> &programInputs,
-                    const std::unordered_set<uint32_t> &programOutputs)
+  ProgramTensorPool(
+      const std::unordered_map<std::uint32_t, ::ttnn::Tensor *> &liveTensors,
+      const std::vector<std::uint32_t> &programInputs,
+      const std::vector<std::uint32_t> &programOutputs)
       : programInputs(programInputs), programOutputs(programOutputs),
         liveTensors(liveTensors) {}
   ProgramTensorPool(const ProgramTensorPool &) = delete;
@@ -25,72 +44,38 @@ class ProgramTensorPool {
   ProgramTensorPool(ProgramTensorPool &&) = default;
   ProgramTensorPool &operator=(ProgramTensorPool &&) = default;
 
-  auto try_emplace(std::uint32_t globalId, const ::ttnn::Tensor &tensor) {
-    auto it = liveTensors.find(globalId);
-    if (it != liveTensors.end()) {
-      return std::make_pair(it, false);
-    }
-    assert(!intermedTensors.contains(globalId));
-    intermedTensors.try_emplace(globalId, tensor);
-    return liveTensors.try_emplace(globalId, &intermedTensors.at(globalId));
-  }
+  std::pair<std::unordered_map<std::uint32_t, ::ttnn::Tensor *>::iterator, bool>
+  try_emplace(std::uint32_t globalId, const ::ttnn::Tensor &tensor);
 
-  auto insert_or_assign(std::uint32_t globalId, const ::ttnn::Tensor &tensor) {
-    intermedTensors.insert_or_assign(globalId, tensor);
-    return liveTensors.insert_or_assign(globalId,
-                                        &intermedTensors.at(globalId));
-  }
+  std::pair<std::unordered_map<std::uint32_t, ::ttnn::Tensor *>::iterator, bool>
+  insert_or_assign(std::uint32_t globalId, const ::ttnn::Tensor &tensor);
 
-  ::ttnn::Tensor &at(std::uint32_t globalId) {
-    assert(liveTensors.contains(globalId));
-    return *liveTensors.at(globalId);
-  }
+  ::ttnn::Tensor &at(std::uint32_t globalId);
 
-  const ::ttnn::Tensor &at(std::uint32_t globalId) const {
-    assert(liveTensors.contains(globalId));
-    return *liveTensors.at(globalId);
-  }
+  const ::ttnn::Tensor &at(std::uint32_t globalId) const;
 
-  size_t erase(std::uint32_t globalId) {
-    assert(liveTensors.contains(globalId) &&
-           intermedTensors.contains(globalId));
-    intermedTensors.erase(globalId);
-    return liveTensors.erase(globalId);
-  }
+  size_t erase(std::uint32_t globalId);
 
-  void copyTensorToUserOutput(std::uint32_t outputGlobalId,
-                              const ::ttnn::Tensor &srcTensor) {
-    assert(liveTensors.contains(outputGlobalId));
-    assert(isUserOutput(outputGlobalId));
-    ::ttnn::Tensor &outputTensor = *liveTensors.at(outputGlobalId);
-    void *src = ::tt::tt_metal::get_raw_host_data_ptr(srcTensor);
-    void *dst = ::tt::tt_metal::get_raw_host_data_ptr(outputTensor);
-    size_t size = outputTensor.volume() * outputTensor.element_size();
-    std::memcpy(dst, src, size);
-  }
+  std::vector<Tensor> gatherOutputTensors();
 
   bool contains(std::uint32_t globalId) const {
     return liveTensors.contains(globalId);
   }
 
-  bool isUserOutput(std::uint32_t globalId) const {
-    return programOutputs.contains(globalId);
-  }
-
-  const std::unordered_set<uint32_t> &getProgramInputs() const {
+  const std::vector<std::uint32_t> &getProgramInputs() const {
     return programInputs;
   }
 
-  const std::unordered_set<uint32_t> &getProgramOutputs() const {
+  const std::vector<std::uint32_t> &getProgramOutputs() const {
     return programOutputs;
   }
 
 private:
-  std::unordered_set<uint32_t> programInputs;
-  std::unordered_set<uint32_t> programOutputs;
+  std::vector<std::uint32_t> programInputs;
+  std::vector<std::uint32_t> programOutputs;
   // A superset of intermedTensors, containing pointers to all tensors created
-  // by the program and the input/output tensors passed in by the user
-  TensorMap liveTensors;
+  // by the program and the input tensors passed in by the user
+  std::unordered_map<std::uint32_t, ::ttnn::Tensor *> liveTensors;
 
   // A subset of liveTensors, containing values of any intermediate tensors
   // created by the program
@@ -99,15 +84,11 @@ class ProgramTensorPool {
 
 class ProgramContext {
 public:
-  ProgramContext(const TensorMap &liveTensors,
-                 const std::unordered_set<uint32_t> &programInputs,
-                 const std::unordered_set<uint32_t> &programOutputs,
-                 ::ttnn::MeshDevice *parentMesh)
-      : tensorPool(
-            ProgramTensorPool(liveTensors, programInputs, programOutputs)),
-        parentMesh(parentMesh) {
-    assert(parentMesh && "Parent mesh cannot be null");
-  }
+  ProgramContext(
+      const std::unordered_map<std::uint32_t, ::ttnn::Tensor *> &liveTensors,
+      const std::vector<std::uint32_t> &programInputs,
+      const std::vector<std::uint32_t> &programOutputs,
+      ::ttnn::MeshDevice *parentMesh);
   ProgramContext(const ProgramContext &) = delete;
   ProgramContext &operator=(const ProgramContext &) = delete;
   ProgramContext(ProgramContext &&) = default;
@@ -125,42 +106,17 @@ class ProgramContext {
   //
   // Sub Mesh Operations
   //
-  void addSubMesh(uint32_t meshId,
-                  std::shared_ptr<::ttnn::MeshDevice> subMesh) {
-    auto [it, inserted] = subMeshes.try_emplace(meshId, subMesh);
-    assert(inserted && "Submesh already exists");
-  }
+  void addSubMesh(uint32_t meshId, std::shared_ptr<::ttnn::MeshDevice> subMesh);
 
-  ::ttnn::MeshDevice &getSubMesh(uint32_t meshId) {
-    assert(subMeshes.contains(meshId));
-    return *subMeshes.at(meshId);
-  }
+  ::ttnn::MeshDevice &getSubMesh(uint32_t meshId);
 
-  size_t subMeshSize(uint32_t meshId) const {
-    assert(subMeshes.contains(meshId));
-    return subMeshes.at(meshId)->num_devices();
-  }
+  size_t subMeshSize(uint32_t meshId) const;
 
-  ::ttnn::Device &getDeviceFromSubMesh(uint32_t meshId, int physicalDeviceId) {
-    assert(subMeshes.contains(meshId));
-    auto &subMesh = *subMeshes.at(meshId);
-    return *subMesh.get_device(physicalDeviceId);
-  }
+  ::ttnn::Device &getDeviceFromSubMesh(uint32_t meshId, int physicalDeviceId);
 
-  ::ttnn::Device &getDeviceIndexFromSubMesh(uint32_t meshId, int deviceIndex) {
-    assert(subMeshes.contains(meshId));
-    auto &subMesh = *subMeshes.at(meshId);
-    return *subMesh.get_device_index(deviceIndex);
-  }
+  ::ttnn::Device &getDeviceIndexFromSubMesh(uint32_t meshId, int deviceIndex);
 
-  DeviceVariant getTargetDevice(uint32_t meshId) {
-    assert(subMeshes.contains(meshId));
-    auto &subMesh = *subMeshes.at(meshId);
-    if (subMesh.num_devices() == 1) {
-      return std::ref(*subMesh.get_device_index(0));
-    }
-    return std::ref(subMesh);
-  }
+  DeviceVariant getTargetDevice(uint32_t meshId);
 
   //
   // Tensor Pool Operations
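For reference, a minimal construction of the new `LayoutDesc` (the values below are invented, not read from a flatbuffer; a real instance would be derived from the binary's memory descriptor for a given program input):

```cpp
#include <optional>

tt::runtime::ttnn::LayoutDesc makeExampleDesc() {
  tt::runtime::ttnn::LayoutDesc desc(::ttnn::BufferType::DRAM,
                                     ::ttnn::Layout::TILE,
                                     ::ttnn::DataType::BFLOAT16,
                                     /*memoryConfig=*/std::nullopt);
  // Anything whose buffer type is not SYSTEM_MEMORY counts as on-device.
  bool onDevice = desc.isOnDevice(); // true
  (void)onDevice;
  return desc;
}
```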
diff --git a/runtime/lib/ttnn/include/tt/runtime/ttnn/utils.cpp b/runtime/lib/ttnn/include/tt/runtime/ttnn/utils.cpp
new file mode 100644
index 0000000000..914c80cf38
--- /dev/null
+++ b/runtime/lib/ttnn/include/tt/runtime/ttnn/utils.cpp
@@ -0,0 +1,216 @@
+// SPDX-FileCopyrightText: (c) 2024 Tenstorrent AI ULC
+//
+// SPDX-License-Identifier: Apache-2.0
+
+#include "tt/runtime/ttnn/utils.h"
+#include "tt/runtime/detail/logger.h"
+
+namespace tt::runtime::ttnn::utils {
+
+// TODO (bug #701)
+// Currently the memory layout/location in flatbuffer is incorrect
+// These methods are workarounds for operations such that we query the info
+// directly from the TTNN tensor. Ideally, we should be able to get all of this
+// info directly from the flatbuffer
+bool isOnHost(const ::ttnn::StorageType &storageType) {
+  return storageType == ::tt::tt_metal::StorageType::BORROWED or
+         storageType == ::tt::tt_metal::StorageType::OWNED or
+         storageType == ::tt::tt_metal::StorageType::MULTI_DEVICE_HOST;
+}
+
+bool isOnDevice(const ::ttnn::StorageType &storageType) {
+  return storageType == ::tt::tt_metal::StorageType::DEVICE or
+         storageType == ::tt::tt_metal::StorageType::MULTI_DEVICE;
+}
+
+bool isValidTileShape(const ::tt::target::Dim2d *shape) {
+  return (shape->x() == 1 and shape->y() == 1) or
+         (shape->x() == 32 and shape->y() == 32);
+}
+
+::ttnn::DataType toTTNNDataType(::tt::target::DataType dataType) {
+  switch (dataType) {
+  case ::tt::target::DataType::Float32:
+    return ::ttnn::DataType::FLOAT32;
+  case ::tt::target::DataType::BFloat16:
+    return ::ttnn::DataType::BFLOAT16;
+  case ::tt::target::DataType::BFP_BFloat8:
+    return ::ttnn::DataType::BFLOAT8_B;
+  case ::tt::target::DataType::BFP_BFloat4:
+    return ::ttnn::DataType::BFLOAT4_B;
+  case ::tt::target::DataType::UInt32:
+    return ::ttnn::DataType::UINT32;
+  case ::tt::target::DataType::UInt16:
+    return ::ttnn::DataType::UINT16;
+
+  default:
+    LOG_FATAL("Unsupported data type");
+  }
+}
+
+::tt::target::DataType fromTTNNDataType(::ttnn::DataType dataType) {
+  switch (dataType) {
+  case ::ttnn::DataType::FLOAT32:
+    return ::tt::target::DataType::Float32;
+  case ::ttnn::DataType::BFLOAT16:
+    return ::tt::target::DataType::BFloat16;
+  case ::ttnn::DataType::BFLOAT8_B:
+    return ::tt::target::DataType::BFP_BFloat8;
+  case ::ttnn::DataType::BFLOAT4_B:
+    return ::tt::target::DataType::BFP_BFloat4;
+  case ::ttnn::DataType::UINT32:
+    return ::tt::target::DataType::UInt32;
+  case ::ttnn::DataType::UINT16:
+    return ::tt::target::DataType::UInt16;
+
+  default:
+    LOG_FATAL("Unsupported data type");
+  }
+}
+
+::ttnn::Layout toTTNNLayout(::tt::target::TensorLayout layout) {
+  switch (layout) {
+  case ::tt::target::TensorLayout::Tile:
+    return ::ttnn::Layout::TILE;
+  case ::tt::target::TensorLayout::RowMajor:
+    return ::ttnn::Layout::ROW_MAJOR;
+  default:
+    LOG_FATAL("Unsupported layout");
+  }
+}
+
+::ttnn::TensorMemoryLayout
+toTTNNTensorMemoryLayout(::tt::target::TensorMemoryLayout tensorMemoryLayout) {
+
+  switch (tensorMemoryLayout) {
+  case ::tt::target::TensorMemoryLayout::Interleaved:
+    return ::ttnn::TensorMemoryLayout::INTERLEAVED;
+  case ::tt::target::TensorMemoryLayout::SingleBank:
+    return ::ttnn::TensorMemoryLayout::SINGLE_BANK;
+  case ::tt::target::TensorMemoryLayout::HeightSharded:
+    return ::ttnn::TensorMemoryLayout::HEIGHT_SHARDED;
+  case ::tt::target::TensorMemoryLayout::WidthSharded:
+    return ::ttnn::TensorMemoryLayout::WIDTH_SHARDED;
+  case ::tt::target::TensorMemoryLayout::BlockSharded:
+    return ::ttnn::TensorMemoryLayout::BLOCK_SHARDED;
+  case ::tt::target::TensorMemoryLayout::None:
+    LOG_FATAL("Unsupported tensor memory layout None");
+  }
+}
+
+// This method will be deprecated in favor of method below
+//
+::tt::tt_metal::BufferType
+toTTNNBufferType(::tt::target::MemorySpace memorySpace) {
+  switch (memorySpace) {
+  case ::tt::target::MemorySpace::System:
+  case ::tt::target::MemorySpace::SystemMMIO:
+    return ::tt::tt_metal::BufferType::SYSTEM_MEMORY;
+  case ::tt::target::MemorySpace::DeviceDRAM:
+    return ::tt::tt_metal::BufferType::DRAM;
+  case ::tt::target::MemorySpace::DeviceL1:
+    return ::tt::tt_metal::BufferType::L1;
+  }
+}
+
+// Prefer to use this method
+//
+::ttnn::BufferType toTTNNBufferType(::tt::target::BufferType bufferType) {
+
+  switch (bufferType) {
+  case ::tt::target::BufferType::DRAM:
+    return ::ttnn::BufferType::DRAM;
+  case ::tt::target::BufferType::L1:
+    return ::ttnn::BufferType::L1;
+  case ::tt::target::BufferType::SystemMemory:
+    return ::ttnn::BufferType::SYSTEM_MEMORY;
+  case ::tt::target::BufferType::L1Small:
+    return ::ttnn::BufferType::L1_SMALL;
+  case ::tt::target::BufferType::Trace:
+    return ::ttnn::BufferType::TRACE;
+  }
+};
+
+std::vector<uint32_t>
+toShapeFromFBShape(const flatbuffers::Vector<int32_t> &vec) {
+  return std::vector<uint32_t>(vec.begin(), vec.end());
+}
+
+::ttnn::Layout
+inferLayoutFromTileShape(const ::tt::target::TensorRef *tensorRef) {
+  const ::tt::target::Dim2d *tileShape =
+      tensorRef->desc()->layout()->memory_desc()->tile_shape();
+  LOG_ASSERT(isValidTileShape(tileShape));
+  if (tileShape->x() == 1 and tileShape->y() == 1) {
+    return ::ttnn::Layout::ROW_MAJOR;
+  }
+  return ::ttnn::Layout::TILE;
+}
+
+CoreRangeSet
+toCoreRangeSet(const ::flatbuffers::Vector<const tt::target::Dim2dRange *>
+                   *coreRangeSet) {
+  std::set<CoreRange> coreRanges;
+  for (::tt::target::Dim2dRange const *coreRange : *coreRangeSet) {
+    CoreCoord start(coreRange->loc().x(), coreRange->loc().y());
+    // End is inclusive
+    CoreCoord end(coreRange->loc().x() + coreRange->size().x() - 1,
+                  coreRange->loc().y() + coreRange->size().y() - 1);
+
+    coreRanges.emplace(start, end);
+  }
+  return CoreRangeSet(coreRanges);
+}
+
+::tt::tt_metal::MemoryConfig
+createMemoryConfig(const ::tt::target::TensorRef *tensorRef) {
+  const ::tt::target::LayoutDesc *layout = tensorRef->desc()->layout();
+  const ::tt::target::TensorMemoryLayout targetMemoryLayout =
+      layout->memory_desc()->memory_layout();
+  const ::tt::target::MemorySpace targetMemorySpace =
+      layout->memory_desc()->memory_space();
+  const ::flatbuffers::Vector<const ::tt::target::Dim2dRange *>
+      *targetCoreRangeSet = layout->core_range_set();
+  const ::flatbuffers::Vector<int32_t> *targetShardShape =
+      layout->memory_desc()->shape();
+  const ::tt::target::Dim2d *tileShape = layout->memory_desc()->tile_shape();
+
+  LOG_ASSERT(targetCoreRangeSet->size() == 1,
+             "Currently only single core range/grid is supported");
+
+  LOG_ASSERT(targetShardShape->size() == 2,
+             "Only 2D shard shape is supported in TTNN backend");
+
+  LOG_ASSERT(::tt::runtime::ttnn::utils::isValidTileShape(tileShape),
+             "Invalid tile shape");
+
+  CoreRangeSet ttnnCoreRangeSet = toCoreRangeSet(targetCoreRangeSet);
+  std::array<uint32_t, 2> ttnnShardShape;
+  std::copy(targetShardShape->begin(), targetShardShape->end(),
+            ttnnShardShape.begin());
+
+  ttnnShardShape[0] *= tileShape->y();
+  ttnnShardShape[1] *= tileShape->x();
+
+  ::tt::tt_metal::ShardSpec shardSpec(
+      ttnnCoreRangeSet, ttnnShardShape,
+      ::tt::tt_metal::ShardOrientation::ROW_MAJOR, false);
+
+  ::tt::tt_metal::TensorMemoryLayout ttnnMemLayout =
+      toTTNNTensorMemoryLayout(targetMemoryLayout);
+
+  ::tt::tt_metal::BufferType ttnnBufferType =
+      toTTNNBufferType(targetMemorySpace);
+
+  ::tt::tt_metal::MemoryConfig memoryConfig{.memory_layout = ttnnMemLayout,
+                                            .buffer_type = ttnnBufferType,
+                                            .shard_spec = shardSpec};
+  return memoryConfig;
+}
std::make_shared<::ttnn::Tensor>(tensor); + return Tensor(std::static_pointer_cast(tensorPtr), nullptr, + DeviceRuntime::TTNN); +} + +} // namespace tt::runtime::ttnn::utils diff --git a/runtime/lib/ttnn/include/tt/runtime/ttnn/utils.h b/runtime/lib/ttnn/include/tt/runtime/ttnn/utils.h index ca50ad58b3..353195b8df 100644 --- a/runtime/lib/ttnn/include/tt/runtime/ttnn/utils.h +++ b/runtime/lib/ttnn/include/tt/runtime/ttnn/utils.h @@ -6,126 +6,50 @@ #define TT_RUNTIME_TTNN_UTILS_H #include "flatbuffers/vector.h" +#include "tt/runtime/detail/ttnn.h" #include "ttmlir/Target/Common/types_generated.h" #include "ttmlir/Target/TTNN/Target.h" -#include "ttnn/types.hpp" namespace tt::runtime::ttnn::utils { -inline bool isValidTileShape(const ::tt::target::Dim2d *shape) { - return (shape->x() == 1 and shape->y() == 1) or - (shape->x() == 32 and shape->y() == 32); -} - -inline ::ttnn::DataType toTTNNDataType(::tt::target::DataType dataType) { - switch (dataType) { - case ::tt::target::DataType::Float32: - return ::ttnn::DataType::FLOAT32; - case ::tt::target::DataType::BFloat16: - return ::ttnn::DataType::BFLOAT16; - case ::tt::target::DataType::BFP_BFloat8: - return ::ttnn::DataType::BFLOAT8_B; - case ::tt::target::DataType::BFP_BFloat4: - return ::ttnn::DataType::BFLOAT4_B; - case ::tt::target::DataType::UInt32: - return ::ttnn::DataType::UINT32; - case ::tt::target::DataType::UInt16: - return ::ttnn::DataType::UINT16; - - default: - throw std::runtime_error("Unsupported data type"); - } -} - -inline ::tt::target::DataType fromTTNNDataType(::ttnn::DataType dataType) { - switch (dataType) { - case ::ttnn::DataType::FLOAT32: - return ::tt::target::DataType::Float32; - case ::ttnn::DataType::BFLOAT16: - return ::tt::target::DataType::BFloat16; - case ::ttnn::DataType::BFLOAT8_B: - return ::tt::target::DataType::BFP_BFloat8; - case ::ttnn::DataType::BFLOAT4_B: - return ::tt::target::DataType::BFP_BFloat4; - case ::ttnn::DataType::UINT32: - return ::tt::target::DataType::UInt32; - case ::ttnn::DataType::UINT16: - return ::tt::target::DataType::UInt16; - - default: - throw std::runtime_error("Unsupported data type"); - } -} - -inline ::ttnn::Layout toTTNNLayout(::tt::target::TensorLayout layout) { - switch (layout) { - case ::tt::target::TensorLayout::Tile: - return ::ttnn::Layout::TILE; - case ::tt::target::TensorLayout::RowMajor: - return ::ttnn::Layout::ROW_MAJOR; - default: - throw std::runtime_error("Unsupported layout"); - } -} - -inline ::ttnn::TensorMemoryLayout -toTTNNTensorMemoryLayout(::tt::target::TensorMemoryLayout tensorMemoryLayout) { - - switch (tensorMemoryLayout) { - case ::tt::target::TensorMemoryLayout::Interleaved: - return ::ttnn::TensorMemoryLayout::INTERLEAVED; - case ::tt::target::TensorMemoryLayout::SingleBank: - return ::ttnn::TensorMemoryLayout::SINGLE_BANK; - case ::tt::target::TensorMemoryLayout::HeightSharded: - return ::ttnn::TensorMemoryLayout::HEIGHT_SHARDED; - case ::tt::target::TensorMemoryLayout::WidthSharded: - return ::ttnn::TensorMemoryLayout::WIDTH_SHARDED; - case ::tt::target::TensorMemoryLayout::BlockSharded: - return ::ttnn::TensorMemoryLayout::BLOCK_SHARDED; - case ::tt::target::TensorMemoryLayout::None: - assert(false && - "Unsupported tensor memory layout TensorMemoryLayout::None"); - } -} +bool isOnHost(const ::ttnn::StorageType &storageType); + +bool isOnDevice(const ::ttnn::StorageType &storageType); + +bool isValidTileShape(const ::tt::target::Dim2d *shape); + +::ttnn::DataType toTTNNDataType(::tt::target::DataType dataType); + +::tt::target::DataType 
diff --git a/runtime/lib/ttnn/include/tt/runtime/ttnn/utils.h b/runtime/lib/ttnn/include/tt/runtime/ttnn/utils.h
index ca50ad58b3..353195b8df 100644
--- a/runtime/lib/ttnn/include/tt/runtime/ttnn/utils.h
+++ b/runtime/lib/ttnn/include/tt/runtime/ttnn/utils.h
@@ -6,126 +6,50 @@
 #define TT_RUNTIME_TTNN_UTILS_H
 
 #include "flatbuffers/vector.h"
+#include "tt/runtime/detail/ttnn.h"
 #include "ttmlir/Target/Common/types_generated.h"
 #include "ttmlir/Target/TTNN/Target.h"
-#include "ttnn/types.hpp"
 
 namespace tt::runtime::ttnn::utils {
 
-inline bool isValidTileShape(const ::tt::target::Dim2d *shape) {
-  return (shape->x() == 1 and shape->y() == 1) or
-         (shape->x() == 32 and shape->y() == 32);
-}
-
-inline ::ttnn::DataType toTTNNDataType(::tt::target::DataType dataType) {
-  switch (dataType) {
-  case ::tt::target::DataType::Float32:
-    return ::ttnn::DataType::FLOAT32;
-  case ::tt::target::DataType::BFloat16:
-    return ::ttnn::DataType::BFLOAT16;
-  case ::tt::target::DataType::BFP_BFloat8:
-    return ::ttnn::DataType::BFLOAT8_B;
-  case ::tt::target::DataType::BFP_BFloat4:
-    return ::ttnn::DataType::BFLOAT4_B;
-  case ::tt::target::DataType::UInt32:
-    return ::ttnn::DataType::UINT32;
-  case ::tt::target::DataType::UInt16:
-    return ::ttnn::DataType::UINT16;
-
-  default:
-    throw std::runtime_error("Unsupported data type");
-  }
-}
-
-inline ::tt::target::DataType fromTTNNDataType(::ttnn::DataType dataType) {
-  switch (dataType) {
-  case ::ttnn::DataType::FLOAT32:
-    return ::tt::target::DataType::Float32;
-  case ::ttnn::DataType::BFLOAT16:
-    return ::tt::target::DataType::BFloat16;
-  case ::ttnn::DataType::BFLOAT8_B:
-    return ::tt::target::DataType::BFP_BFloat8;
-  case ::ttnn::DataType::BFLOAT4_B:
-    return ::tt::target::DataType::BFP_BFloat4;
-  case ::ttnn::DataType::UINT32:
-    return ::tt::target::DataType::UInt32;
-  case ::ttnn::DataType::UINT16:
-    return ::tt::target::DataType::UInt16;
-
-  default:
-    throw std::runtime_error("Unsupported data type");
-  }
-}
-
-inline ::ttnn::Layout toTTNNLayout(::tt::target::TensorLayout layout) {
-  switch (layout) {
-  case ::tt::target::TensorLayout::Tile:
-    return ::ttnn::Layout::TILE;
-  case ::tt::target::TensorLayout::RowMajor:
-    return ::ttnn::Layout::ROW_MAJOR;
-  default:
-    throw std::runtime_error("Unsupported layout");
-  }
-}
-
-inline ::ttnn::TensorMemoryLayout
-toTTNNTensorMemoryLayout(::tt::target::TensorMemoryLayout tensorMemoryLayout) {
-
-  switch (tensorMemoryLayout) {
-  case ::tt::target::TensorMemoryLayout::Interleaved:
-    return ::ttnn::TensorMemoryLayout::INTERLEAVED;
-  case ::tt::target::TensorMemoryLayout::SingleBank:
-    return ::ttnn::TensorMemoryLayout::SINGLE_BANK;
-  case ::tt::target::TensorMemoryLayout::HeightSharded:
-    return ::ttnn::TensorMemoryLayout::HEIGHT_SHARDED;
-  case ::tt::target::TensorMemoryLayout::WidthSharded:
-    return ::ttnn::TensorMemoryLayout::WIDTH_SHARDED;
-  case ::tt::target::TensorMemoryLayout::BlockSharded:
-    return ::ttnn::TensorMemoryLayout::BLOCK_SHARDED;
-  case ::tt::target::TensorMemoryLayout::None:
-    assert(false &&
-           "Unsupported tensor memory layout TensorMemoryLayout::None");
-  }
-}
+bool isOnHost(const ::ttnn::StorageType &storageType);
+
+bool isOnDevice(const ::ttnn::StorageType &storageType);
+
+bool isValidTileShape(const ::tt::target::Dim2d *shape);
+
+::ttnn::DataType toTTNNDataType(::tt::target::DataType dataType);
+
+::tt::target::DataType fromTTNNDataType(::ttnn::DataType dataType);
+
+::ttnn::Layout toTTNNLayout(::tt::target::TensorLayout layout);
+
+::ttnn::TensorMemoryLayout
+toTTNNTensorMemoryLayout(::tt::target::TensorMemoryLayout tensorMemoryLayout);
 
 // This method will be deprecated in favor of method below
 //
-inline ::tt::tt_metal::BufferType
-toTTNNBufferType(::tt::target::MemorySpace memorySpace) {
-  switch (memorySpace) {
-  case ::tt::target::MemorySpace::System:
-  case ::tt::target::MemorySpace::SystemMMIO:
-    return ::tt::tt_metal::BufferType::SYSTEM_MEMORY;
-  case ::tt::target::MemorySpace::DeviceDRAM:
-    return ::tt::tt_metal::BufferType::DRAM;
-  case ::tt::target::MemorySpace::DeviceL1:
-    return ::tt::tt_metal::BufferType::L1;
-  }
-}
+::tt::tt_metal::BufferType
+toTTNNBufferType(::tt::target::MemorySpace memorySpace);
 
 // Prefer to use this method
 //
-inline ::ttnn::BufferType
-toTTNNBufferType(::tt::target::BufferType bufferType) {
-
-  switch (bufferType) {
-  case ::tt::target::BufferType::DRAM:
-    return ::ttnn::BufferType::DRAM;
-  case ::tt::target::BufferType::L1:
-    return ::ttnn::BufferType::L1;
-  case ::tt::target::BufferType::SystemMemory:
-    return ::ttnn::BufferType::SYSTEM_MEMORY;
-  case ::tt::target::BufferType::L1Small:
-    return ::ttnn::BufferType::L1_SMALL;
-  case ::tt::target::BufferType::Trace:
-    return ::ttnn::BufferType::TRACE;
-  }
-};
-
-inline std::vector<uint32_t>
-toShapeFromFBShape(const flatbuffers::Vector<int32_t> &vec) {
-  return std::vector<uint32_t>(vec.begin(), vec.end());
-}
+::ttnn::BufferType toTTNNBufferType(::tt::target::BufferType bufferType);
+
+std::vector<uint32_t>
+toShapeFromFBShape(const flatbuffers::Vector<int32_t> &vec);
+
+::ttnn::Layout
+inferLayoutFromTileShape(const ::tt::target::TensorRef *tensorRef);
+
+CoreRangeSet
+toCoreRangeSet(const ::flatbuffers::Vector<const tt::target::Dim2dRange *>
+                   *coreRangeSet);
+
+::tt::tt_metal::MemoryConfig
+createMemoryConfig(const ::tt::target::TensorRef *tensorRef);
+
+Tensor createRuntimeTensorFromTTNN(const ::ttnn::Tensor &tensor);
 
 } // namespace tt::runtime::ttnn::utils
diff --git a/runtime/lib/ttnn/operations/CMakeLists.txt b/runtime/lib/ttnn/operations/CMakeLists.txt
index 4edc4780b9..df32e2c759 100644
--- a/runtime/lib/ttnn/operations/CMakeLists.txt
+++ b/runtime/lib/ttnn/operations/CMakeLists.txt
@@ -45,12 +45,13 @@ target_include_directories(TTRuntimeTTNNOps PUBLIC
   ${PROJECT_SOURCE_DIR}/runtime/lib/ttnn/operations/include
   ${PROJECT_BINARY_DIR}/include/ttmlir/Target/Common
 )
+
 target_include_directories(TTRuntimeTTNNOps SYSTEM PUBLIC "$<TARGET_PROPERTY:Metalium::Metal,INTERFACE_INCLUDE_DIRECTORIES>")
-target_link_libraries(TTRuntimeTTNNOps PUBLIC TTNN_LIBRARY)
+target_link_libraries(TTRuntimeTTNNOps PUBLIC TTNN_LIBRARY TTRuntimeTTNNHelpers)
 
 if (TT_RUNTIME_ENABLE_PERF_TRACE)
   target_link_libraries(TTRuntimeTTNNOps PUBLIC TRACY_LIBRARY)
 endif()
 
-add_dependencies(TTRuntimeTTNNOps TTNN_LIBRARY tt-metal FBS_GENERATION)
+add_dependencies(TTRuntimeTTNNOps TTNN_LIBRARY tt-metal FBS_GENERATION TTRuntimeTTNNHelpers)
num_links = op->num_links(); ::tt::tt_metal::MemoryConfig outputMemoryConfig = - utils::createMemoryConfig(op->out()); + ::tt::runtime::ttnn::utils::createMemoryConfig(op->out()); ::ttnn::Tensor out = ::ttnn::all_gather(input, dim, num_links, outputMemoryConfig); tensorPool.insert_or_assign(op->out()->global_id(), out); diff --git a/runtime/lib/ttnn/operations/conv/conv2d.cpp b/runtime/lib/ttnn/operations/conv/conv2d.cpp index e6670c1131..5e00b929e7 100644 --- a/runtime/lib/ttnn/operations/conv/conv2d.cpp +++ b/runtime/lib/ttnn/operations/conv/conv2d.cpp @@ -6,6 +6,7 @@ #include "tt/runtime/detail/logger.h" #include "tt/runtime/detail/ttnn.h" #include "tt/runtime/ttnn/operations/utils.h" +#include "tt/runtime/ttnn/utils.h" #include "ttmlir/Target/TTNN/program_generated.h" #include "ttnn/types.hpp" @@ -23,7 +24,8 @@ void run(const ::tt::target::ttnn::Conv2dOp *op, ProgramContext &context) { auto config = ::ttnn::operations::conv::conv2d::Conv2dConfig(); config.dtype = utils::getDataType(op->input()); config.weights_dtype = utils::getDataType(op->weight()); - ::ttnn::MemoryConfig outMemConfig = utils::createMemoryConfig(op->out()); + ::ttnn::MemoryConfig outMemConfig = + ::tt::runtime::ttnn::utils::createMemoryConfig(op->out()); DeviceVariant targetDevice = context.getTargetDevice(op->device()->global_id()); ::ttnn::Tensor out = std::visit( diff --git a/runtime/lib/ttnn/operations/creation/empty.cpp b/runtime/lib/ttnn/operations/creation/empty.cpp index bed68e0f14..d504a798b2 100644 --- a/runtime/lib/ttnn/operations/creation/empty.cpp +++ b/runtime/lib/ttnn/operations/creation/empty.cpp @@ -62,11 +62,12 @@ createEmptyOnMultiDevice(ProgramContext &context, EmptyTensorConfig &config, ::tt::tt_metal::DistributedTensorConfig strategy = config.distributedTensorConfig(); std::vector<::ttnn::Tensor> tensorShards; - tensorShards.resize(config.numShards); - std::generate_n( - tensorShards.begin(), config.numShards, [&config]() -> ::ttnn::Tensor { - return ::ttnn::zeros(config.shape, config.dtype, config.layout); - }); + tensorShards.reserve(config.numShards); + std::generate_n(std::back_inserter(tensorShards), config.numShards, + [&config]() -> ::ttnn::Tensor { + return ::ttnn::zeros(config.shape, config.dtype, + config.layout); + }); ::ttnn::Tensor out = ::ttnn::distributed::api::create_multi_device_tensor( tensorShards, ::tt::tt_metal::StorageType::MULTI_DEVICE_HOST, strategy); if (deviceRef) { @@ -101,6 +102,6 @@ void run(const ::tt::target::ttnn::EmptyOp *op, ProgramContext &context) { } else { LOG_FATAL("Unsupported num shards"); } - utils::updateTensorPool(tensorPool, out, op->out()->global_id()); + tensorPool.insert_or_assign(op->out()->global_id(), out); } } // namespace tt::runtime::ttnn::operations::creation diff --git a/runtime/lib/ttnn/operations/creation/full.cpp b/runtime/lib/ttnn/operations/creation/full.cpp index 6a224f935d..7f6a6c0b6f 100644 --- a/runtime/lib/ttnn/operations/creation/full.cpp +++ b/runtime/lib/ttnn/operations/creation/full.cpp @@ -26,7 +26,7 @@ struct FullTensorConfig { fillValue(op->fill_value()), numShards(op->num_shards()), strategy(op->strategy()) { - layout = utils::inferLayoutFromTileShape(op->out()); + layout = ::tt::runtime::ttnn::utils::inferLayoutFromTileShape(op->out()); // TODO(bug #272), determine correct layout by tile shape in the future // currently tile shape is not set correctly, so as a workaround, hardcode @@ -42,8 +42,7 @@ struct FullTensorConfig { } if (!utils::inSystemMemory(op->out())) { - memoryConfig = - 
::tt::runtime::ttnn::operations::utils::createMemoryConfig(op->out()); + memoryConfig = ::tt::runtime::ttnn::utils::createMemoryConfig(op->out()); } validate(); } @@ -72,8 +71,8 @@ createFullOnMultiDevice(ProgramContext &context, FullTensorConfig &config, ::tt::tt_metal::DistributedTensorConfig strategy = config.distributedTensorConfig(); std::vector<::ttnn::Tensor> tensorShards; - tensorShards.resize(config.numShards); - std::generate_n(tensorShards.begin(), config.numShards, + tensorShards.reserve(config.numShards); + std::generate_n(std::back_inserter(tensorShards), config.numShards, [&config]() -> ::ttnn::Tensor { return ::ttnn::full(config.shape, config.fillValue, config.dtype, config.layout); @@ -116,6 +115,6 @@ void run(const ::tt::target::ttnn::FullOp *op, ProgramContext &context) { } else { LOG_FATAL("Unsupported num shards"); } - utils::updateTensorPool(tensorPool, out, op->out()->global_id()); + tensorPool.insert_or_assign(op->out()->global_id(), out); } } // namespace tt::runtime::ttnn::operations::creation diff --git a/runtime/lib/ttnn/operations/data_movement/transpose.cpp b/runtime/lib/ttnn/operations/data_movement/transpose.cpp index ef8dcf1b13..c86c0ee10a 100644 --- a/runtime/lib/ttnn/operations/data_movement/transpose.cpp +++ b/runtime/lib/ttnn/operations/data_movement/transpose.cpp @@ -6,6 +6,7 @@ #include "tt/runtime/detail/logger.h" #include "tt/runtime/detail/ttnn.h" #include "tt/runtime/ttnn/operations/utils.h" +#include "tt/runtime/ttnn/utils.h" namespace tt::runtime::ttnn::operations::data_movement { void run(const ::tt::target::ttnn::TransposeOp *op, ProgramContext &context) { @@ -15,7 +16,7 @@ void run(const ::tt::target::ttnn::TransposeOp *op, ProgramContext &context) { int32_t dim0 = op->dim0(); int32_t dim1 = op->dim1(); ::tt::tt_metal::MemoryConfig outputMemoryConfig = - utils::createMemoryConfig(op->out()); + ::tt::runtime::ttnn::utils::createMemoryConfig(op->out()); ::ttnn::Tensor out = ::ttnn::transpose(in, dim0, dim1, outputMemoryConfig); tensorPool.insert_or_assign(op->out()->global_id(), out); } diff --git a/runtime/lib/ttnn/operations/deletion/deallocate.cpp b/runtime/lib/ttnn/operations/deletion/deallocate.cpp index 6204945b34..e871a9ea64 100644 --- a/runtime/lib/ttnn/operations/deletion/deallocate.cpp +++ b/runtime/lib/ttnn/operations/deletion/deallocate.cpp @@ -11,13 +11,6 @@ void run(const ::tt::target::ttnn::DeallocateOp *op, ProgramContext &context) { ::ttnn::Tensor &tensor = tensorPool.at(op->in()->global_id()); DEBUG_ASSERT(tensor.is_allocated()); ::ttnn::deallocate(tensor, op->force()); - - // The tensor should be deallocated after the deallocate call. - // Still this assert may be hit in the future for multidevice/async ttnn - // support. In that case, we will reevaluate the assert/dealloc behaviour and - // adjust it accordingly. 
diff --git a/runtime/lib/ttnn/operations/deletion/deallocate.cpp b/runtime/lib/ttnn/operations/deletion/deallocate.cpp
index 6204945b34..e871a9ea64 100644
--- a/runtime/lib/ttnn/operations/deletion/deallocate.cpp
+++ b/runtime/lib/ttnn/operations/deletion/deallocate.cpp
@@ -11,13 +11,6 @@ void run(const ::tt::target::ttnn::DeallocateOp *op, ProgramContext &context) {
   ::ttnn::Tensor &tensor = tensorPool.at(op->in()->global_id());
   DEBUG_ASSERT(tensor.is_allocated());
   ::ttnn::deallocate(tensor, op->force());
-
-  // The tensor should be deallocated after the deallocate call.
-  // Still this assert may be hit in the future for multidevice/async ttnn
-  // support. In that case, we will reevaluate the assert/dealloc behaviour and
-  // adjust it accordingly.
-  //
-  DEBUG_ASSERT(!tensor.is_allocated());
   tensorPool.erase(op->in()->global_id());
 }
 } // namespace tt::runtime::ttnn::operations::deletion
diff --git a/runtime/lib/ttnn/operations/eltwise/binary/binary.cpp b/runtime/lib/ttnn/operations/eltwise/binary/binary.cpp
index 5913971192..ff47bdcdd8 100644
--- a/runtime/lib/ttnn/operations/eltwise/binary/binary.cpp
+++ b/runtime/lib/ttnn/operations/eltwise/binary/binary.cpp
@@ -6,6 +6,7 @@
 #include "tt/runtime/detail/ttnn.h"
 #include "tt/runtime/ttnn/operations/eltwise/binary/utils.h"
 #include "tt/runtime/ttnn/operations/utils.h"
+#include "tt/runtime/ttnn/utils.h"
 #include "ttnn/operations/eltwise/binary/binary_composite.hpp"
 
 namespace tt::runtime::ttnn::operations::binary {
@@ -26,7 +27,7 @@ static void runEltwiseBinaryOp(
 
   ::ttnn::DataType outputDataType = utils::getDataType(op->out());
   ::tt::tt_metal::MemoryConfig outputMemoryConfig =
-      utils::createMemoryConfig(op->out());
+      ::tt::runtime::ttnn::utils::createMemoryConfig(op->out());
 
   ::ttnn::Tensor out = ttnnOp(*lhs, *rhs, outputDataType, outputMemoryConfig,
                               std::nullopt, std::nullopt, std::nullopt);
diff --git a/runtime/lib/ttnn/operations/eltwise/binary/binary_composite.cpp b/runtime/lib/ttnn/operations/eltwise/binary/binary_composite.cpp
index 2a05d6246f..5ed4418f96 100644
--- a/runtime/lib/ttnn/operations/eltwise/binary/binary_composite.cpp
+++ b/runtime/lib/ttnn/operations/eltwise/binary/binary_composite.cpp
@@ -6,6 +6,7 @@
 #include "tt/runtime/detail/ttnn.h"
 #include "tt/runtime/ttnn/operations/eltwise/binary/utils.h"
 #include "tt/runtime/ttnn/operations/utils.h"
+#include "tt/runtime/ttnn/utils.h"
 
 namespace tt::runtime::ttnn::operations::binary::composite {
 
@@ -20,7 +21,7 @@ static void runEltwiseBinaryCompositeOp(
   getEltwiseBinaryOpInputTensors(op, tensorPool, &lhs, &rhs);
 
   ::tt::tt_metal::MemoryConfig outputMemoryConfig =
-      utils::createMemoryConfig(op->out());
+      ::tt::runtime::ttnn::utils::createMemoryConfig(op->out());
 
   ::ttnn::Tensor out = ttnnOp(*lhs, *rhs, outputMemoryConfig);
   tensorPool.insert_or_assign(op->out()->global_id(), out);
diff --git a/runtime/lib/ttnn/operations/eltwise/ternary/ternary.cpp b/runtime/lib/ttnn/operations/eltwise/ternary/ternary.cpp
index 6afde5d663..44f1413898 100644
--- a/runtime/lib/ttnn/operations/eltwise/ternary/ternary.cpp
+++ b/runtime/lib/ttnn/operations/eltwise/ternary/ternary.cpp
@@ -22,7 +22,7 @@ static void runEltwiseTernaryWhereOp(
   getEltwiseTernaryOpInputTensors(op, tensorPool, &first, &second, &third);
 
   ::tt::tt_metal::MemoryConfig outputMemoryConfig =
-      utils::createMemoryConfig(op->out());
+      ::tt::runtime::ttnn::utils::createMemoryConfig(op->out());
 
   ::ttnn::Tensor out = ttnnOp(*first, *second, *third, outputMemoryConfig);
   tensorPool.insert_or_assign(op->out()->global_id(), out);
::tt::runtime::ttnn::utils::createMemoryConfig(op->out()); ::ttnn::Tensor out = ttnnOp(*in, outputMemoryConfig, std::nullopt); tensorPool.insert_or_assign(op->out()->global_id(), out); @@ -39,7 +40,7 @@ static void runEltwiseUnaryWithFastAndApproximateModeOp( getEltwiseUnaryOpInputTensor(op, tensorPool, &in); ::tt::tt_metal::MemoryConfig outputMemoryConfig = - utils::createMemoryConfig(op->out()); + ::tt::runtime::ttnn::utils::createMemoryConfig(op->out()); ::ttnn::Tensor out = ttnnOp(*in, false /* parameter */, outputMemoryConfig, std::nullopt); @@ -56,7 +57,7 @@ static void runEltwiseUnaryWithFloatParameterOp( float parameter = op->params_as_EltwiseOpWithFloatParams()->parameter(); ::tt::tt_metal::MemoryConfig outputMemoryConfig = - utils::createMemoryConfig(op->out()); + ::tt::runtime::ttnn::utils::createMemoryConfig(op->out()); ::ttnn::Tensor out = ttnnOp(*in, parameter, outputMemoryConfig); tensorPool.insert_or_assign(op->out()->global_id(), out); } diff --git a/runtime/lib/ttnn/operations/eltwise/unary/unary_composite.cpp b/runtime/lib/ttnn/operations/eltwise/unary/unary_composite.cpp index fd378d5a26..31514f0fe5 100644 --- a/runtime/lib/ttnn/operations/eltwise/unary/unary_composite.cpp +++ b/runtime/lib/ttnn/operations/eltwise/unary/unary_composite.cpp @@ -6,6 +6,7 @@ #include "tt/runtime/detail/ttnn.h" #include "tt/runtime/ttnn/operations/eltwise/unary/utils.h" #include "tt/runtime/ttnn/operations/utils.h" +#include "tt/runtime/ttnn/utils.h" #include "ttnn/operations/eltwise/unary/unary_composite.hpp" namespace tt::runtime::ttnn::operations::unary::composite { @@ -20,27 +21,26 @@ static void runEltwiseUnaryCompositeOp( getEltwiseUnaryOpInputTensor(op, tensorPool, &in); ::tt::tt_metal::MemoryConfig outputMemoryConfig = - utils::createMemoryConfig(op->out()); + ::tt::runtime::ttnn::utils::createMemoryConfig(op->out()); ::ttnn::Tensor out = ttnnOp(*in, outputMemoryConfig); tensorPool.insert_or_assign(op->out()->global_id(), out); } -static void runEltwiseUnaryCompositeClampOP( +static void runEltwiseUnaryCompositeClampOp( const ::tt::target::ttnn::EltwiseOp *op, ProgramTensorPool &tensorPool, - std::function<::ttnn::Tensor(const ::ttnn::Tensor &, float, float, - const ::tt::tt_metal::MemoryConfig &)> - ttnnOp) { + const std::function<::ttnn::Tensor(const ::ttnn::Tensor &, float, float, + const ::tt::tt_metal::MemoryConfig &)> + &ttnnOp) { ::ttnn::Tensor *in = nullptr; getEltwiseUnaryOpInputTensor(op, tensorPool, &in); float min = op->params_as_ClampOpParams()->min(); float max = op->params_as_ClampOpParams()->max(); ::tt::tt_metal::MemoryConfig outputMemoryConfig = - utils::createMemoryConfig(op->out()); + ::tt::runtime::ttnn::utils::createMemoryConfig(op->out()); ::ttnn::Tensor out = ttnnOp(*in, min, max, outputMemoryConfig); tensorPool.insert_or_assign(op->out()->global_id(), out); - return; } void run(const ::tt::target::ttnn::EltwiseOp *op, ProgramContext &context) { @@ -51,7 +51,7 @@ void run(const ::tt::target::ttnn::EltwiseOp *op, ProgramContext &context) { break; } case ::tt::target::ttnn::EltwiseOpType::Clamp: { - runEltwiseUnaryCompositeClampOP(op, tensorPool, ::ttnn::clamp); + runEltwiseUnaryCompositeClampOp(op, tensorPool, ::ttnn::clamp); break; } case ::tt::target::ttnn::EltwiseOpType::Log1p: { diff --git a/runtime/lib/ttnn/operations/embedding/embedding.cpp b/runtime/lib/ttnn/operations/embedding/embedding.cpp index 47b27ca9ac..511d8256de 100644 --- a/runtime/lib/ttnn/operations/embedding/embedding.cpp +++ b/runtime/lib/ttnn/operations/embedding/embedding.cpp @@ -6,6 
+6,7 @@ #include "tt/runtime/detail/logger.h" #include "tt/runtime/detail/ttnn.h" #include "tt/runtime/ttnn/operations/utils.h" +#include "tt/runtime/ttnn/utils.h" namespace tt::runtime::ttnn::operations::embedding { void run(const ::tt::target::ttnn::EmbeddingOp *op, ProgramContext &context) { @@ -24,7 +25,7 @@ void run(const ::tt::target::ttnn::EmbeddingOp *op, ProgramContext &context) { auto embeddingsType = ::ttnn::operations::embedding::EmbeddingsType::GENERIC; ::ttnn::DataType outputDataType = utils::getDataType(op->out()); ::ttnn::MemoryConfig outputMemoryConfig = - utils::createMemoryConfig(op->out()); + ::tt::runtime::ttnn::utils::createMemoryConfig(op->out()); ::ttnn::Tensor out = ::ttnn::embedding(input, weight, padToken, layout, embeddingsType, outputDataType, outputMemoryConfig); diff --git a/runtime/lib/ttnn/operations/include/tt/runtime/ttnn/operations/utils.cpp b/runtime/lib/ttnn/operations/include/tt/runtime/ttnn/operations/utils.cpp index 435607b87e..df3a34cf89 100644 --- a/runtime/lib/ttnn/operations/include/tt/runtime/ttnn/operations/utils.cpp +++ b/runtime/lib/ttnn/operations/include/tt/runtime/ttnn/operations/utils.cpp @@ -7,25 +7,6 @@ namespace tt::runtime::ttnn::operations::utils { -// TODO (bug #701) -// Currently the memory layout/location in flatbuffer is incorrect -// These methods are workarounds such that we query the info directly from the -// TTNN tensor Ideally, we should be able to get all of this info directly from -// the flatbuffer -bool isOnHost(const ::ttnn::Tensor &tensor) { - // Currently only supports borrowed or owned host storage - return tensor.storage_type() == ::tt::tt_metal::StorageType::BORROWED or - tensor.storage_type() == ::tt::tt_metal::StorageType::OWNED or - tensor.storage_type() == - ::tt::tt_metal::StorageType::MULTI_DEVICE_HOST; -} - -bool isOnDevice(const ::ttnn::Tensor &tensor) { - // Currently only supports single device storage - return tensor.storage_type() == ::tt::tt_metal::StorageType::DEVICE or - tensor.storage_type() == ::tt::tt_metal::StorageType::MULTI_DEVICE; -} - bool isTilized(const ::tt::target::TensorRef *tensorRef) { const ::tt::target::Dim2d *tileShape = tensorRef->desc()->layout()->memory_desc()->tile_shape(); @@ -43,93 +24,11 @@ bool inSystemMemory(const ::tt::target::TensorRef *tensorRef) { targetMemorySpace == ::tt::target::MemorySpace::SystemMMIO; } -void updateTensorPool(ProgramTensorPool &tensorPool, - const ::ttnn::Tensor &tensor, uint32_t outputGlobalId) { - if (tensorPool.isUserOutput(outputGlobalId)) { - tensorPool.copyTensorToUserOutput(outputGlobalId, tensor); - } else { - tensorPool.insert_or_assign(outputGlobalId, tensor); - } -} - ::ttnn::DataType getDataType(const ::tt::target::TensorRef *tensorRef) { return ::tt::runtime::ttnn::utils::toTTNNDataType( tensorRef->desc()->layout()->memory_desc()->data_type()); } -::ttnn::Layout -inferLayoutFromTileShape(const ::tt::target::TensorRef *tensorRef) { - const ::tt::target::Dim2d *tileShape = - tensorRef->desc()->layout()->memory_desc()->tile_shape(); - LOG_ASSERT(::tt::runtime::ttnn::utils::isValidTileShape(tileShape)); - if (tileShape->x() == 1 and tileShape->y() == 1) { - return ::ttnn::Layout::ROW_MAJOR; - } - return ::ttnn::Layout::TILE; -} - -CoreRangeSet -toCoreRangeSet(const ::flatbuffers::Vector - *coreRangeSet) { - std::set coreRanges; - for (::tt::target::Dim2dRange const *coreRange : *coreRangeSet) { - CoreCoord start(coreRange->loc().x(), coreRange->loc().y()); - // End is inclusive - CoreCoord end(coreRange->loc().x() + coreRange->size().x() 
- 1, - coreRange->loc().y() + coreRange->size().y() - 1); - - coreRanges.emplace(start, end); - } - return CoreRangeSet(coreRanges); -} - -// This method will soon be deprecated, prefer to use the method below -// -::tt::tt_metal::MemoryConfig -createMemoryConfig(const ::tt::target::TensorRef *tensorRef) { - const ::tt::target::LayoutDesc *layout = tensorRef->desc()->layout(); - const ::tt::target::TensorMemoryLayout targetMemoryLayout = - layout->memory_desc()->memory_layout(); - const ::tt::target::MemorySpace targetMemorySpace = - layout->memory_desc()->memory_space(); - const ::flatbuffers::Vector - *targetCoreRangeSet = layout->core_range_set(); - const ::flatbuffers::Vector *targetShardShape = - layout->memory_desc()->shape(); - const ::tt::target::Dim2d *tileShape = layout->memory_desc()->tile_shape(); - - LOG_ASSERT(targetCoreRangeSet->size() == 1, - "Currently only single core range/grid is supported"); - - LOG_ASSERT(targetShardShape->size() == 2, - "Only 2D shard shape is supported in TTNN backend"); - - LOG_ASSERT(::tt::runtime::ttnn::utils::isValidTileShape(tileShape), - "Invalid tile shape"); - - CoreRangeSet ttnnCoreRangeSet = toCoreRangeSet(targetCoreRangeSet); - std::array ttnnShardShape; - std::copy(targetShardShape->begin(), targetShardShape->end(), - ttnnShardShape.begin()); - - ttnnShardShape[0] *= tileShape->y(); - ttnnShardShape[1] *= tileShape->x(); - - ::tt::tt_metal::ShardSpec shardSpec( - ttnnCoreRangeSet, ttnnShardShape, - ::tt::tt_metal::ShardOrientation::ROW_MAJOR, false); - - ::tt::tt_metal::TensorMemoryLayout ttnnMemLayout = - ::tt::runtime::ttnn::utils::toTTNNTensorMemoryLayout(targetMemoryLayout); - - ::tt::tt_metal::BufferType ttnnBufferType = - ::tt::runtime::ttnn::utils::toTTNNBufferType(targetMemorySpace); - - return {ttnnMemLayout, ttnnBufferType, shardSpec}; -} - -// Prefer to use this method over the one above -// ::tt::tt_metal::MemoryConfig createMemoryConfig(const ::tt::target::MemoryConfigDesc *memcfg, const ::tt::target::TensorRef *tensorRef) { @@ -144,7 +43,8 @@ createMemoryConfig(const ::tt::target::MemoryConfigDesc *memcfg, const ::tt::target::LayoutDesc *layout = tensorRef->desc()->layout(); const ::flatbuffers::Vector *targetCoreRangeSet = layout->core_range_set(); - CoreRangeSet ttnnCoreRangeSet = toCoreRangeSet(targetCoreRangeSet); + CoreRangeSet ttnnCoreRangeSet = + ::tt::runtime::ttnn::utils::toCoreRangeSet(targetCoreRangeSet); const ::flatbuffers::Vector *shardShape = memcfg->shard_spec()->shard_shape(); const ::tt::target::Dim2d *tileShape = layout->memory_desc()->tile_shape(); diff --git a/runtime/lib/ttnn/operations/include/tt/runtime/ttnn/operations/utils.h b/runtime/lib/ttnn/operations/include/tt/runtime/ttnn/operations/utils.h index b922e120a6..269e0328f9 100644 --- a/runtime/lib/ttnn/operations/include/tt/runtime/ttnn/operations/utils.h +++ b/runtime/lib/ttnn/operations/include/tt/runtime/ttnn/operations/utils.h @@ -13,32 +13,15 @@ namespace tt::runtime::ttnn::operations::utils { -bool isOnHost(const ::ttnn::Tensor &tensor); - -bool isOnDevice(const ::ttnn::Tensor &tensor); - bool isTilized(const ::tt::target::TensorRef *tensorRef); bool inSystemMemory(const ::tt::target::TensorRef *tensorRef); -void updateTensorPool(ProgramTensorPool &tensorPool, - const ::ttnn::Tensor &tensor, uint32_t outputGlobalId); - ::tt::target::MemorySpace getMemorySpace(const ::tt::target::TensorRef *tensorRef); ::ttnn::DataType getDataType(const ::tt::target::TensorRef *tensorRef); -::ttnn::Layout -inferLayoutFromTileShape(const 
::tt::target::TensorRef *tensorRef); - -CoreRangeSet -toCoreRangeSet(const ::flatbuffers::Vector - *coreRangeSet); - -::tt::tt_metal::MemoryConfig -createMemoryConfig(const ::tt::target::TensorRef *tensorRef); - ::tt::tt_metal::MemoryConfig createMemoryConfig(const ::tt::target::MemoryConfigDesc *memcfg, const ::tt::target::TensorRef *tensorRef); diff --git a/runtime/lib/ttnn/operations/layout/from_device.cpp b/runtime/lib/ttnn/operations/layout/from_device.cpp index b6820b6ec7..e26e3be2a3 100644 --- a/runtime/lib/ttnn/operations/layout/from_device.cpp +++ b/runtime/lib/ttnn/operations/layout/from_device.cpp @@ -12,10 +12,11 @@ void run(const ::tt::target::ttnn::FromDeviceOp *op, ProgramContext &context) { ProgramTensorPool &tensorPool = context.getTensorPool(); const ::ttnn::Tensor &inputTensor = tensorPool.at(op->in()->global_id()); DEBUG_ASSERT(inputTensor.is_allocated()); - LOG_ASSERT(utils::isOnDevice(inputTensor), - "Calling ttnn::from_device on a host tensor"); + DEBUG_ASSERT( + ::tt::runtime::ttnn::utils::isOnDevice(inputTensor.storage_type()), + "Calling ttnn::from_device on a host tensor"); ::ttnn::Tensor out = ::ttnn::from_device(inputTensor); - utils::updateTensorPool(tensorPool, out, op->out()->global_id()); + tensorPool.insert_or_assign(op->out()->global_id(), out); } } // namespace tt::runtime::ttnn::operations::layout diff --git a/runtime/lib/ttnn/operations/layout/to_device.cpp b/runtime/lib/ttnn/operations/layout/to_device.cpp index 34af89f504..414afc9f05 100644 --- a/runtime/lib/ttnn/operations/layout/to_device.cpp +++ b/runtime/lib/ttnn/operations/layout/to_device.cpp @@ -14,7 +14,7 @@ void run(const ::tt::target::ttnn::ToDeviceOp *op, ProgramContext &context) { ProgramTensorPool &tensorPool = context.getTensorPool(); const ::ttnn::Tensor &inputTensor = tensorPool.at(op->in()->global_id()); DEBUG_ASSERT(inputTensor.is_allocated()); - DEBUG_ASSERT(utils::isOnHost(inputTensor), + DEBUG_ASSERT(::tt::runtime::ttnn::utils::isOnHost(inputTensor.storage_type()), "Calling ttnn::to_device on a device tensor"); std::optional<::ttnn::MemoryConfig> memoryConfig = std::nullopt; diff --git a/runtime/lib/ttnn/operations/layout/to_layout.cpp b/runtime/lib/ttnn/operations/layout/to_layout.cpp index 5e78a67187..bf80ef292e 100644 --- a/runtime/lib/ttnn/operations/layout/to_layout.cpp +++ b/runtime/lib/ttnn/operations/layout/to_layout.cpp @@ -57,7 +57,7 @@ void run(const ::tt::target::ttnn::ToLayoutOp *op, ProgramContext &context) { out = ::ttnn::to_layout(inputTensor, layout, dtype, memoryConfig, static_cast<::ttnn::Device *>(nullptr)); } - utils::updateTensorPool(tensorPool, out, op->out()->global_id()); + tensorPool.insert_or_assign(op->out()->global_id(), out); } } // namespace tt::runtime::ttnn::operations::layout diff --git a/runtime/lib/ttnn/operations/layout/typecast.cpp b/runtime/lib/ttnn/operations/layout/typecast.cpp index 5529c6112c..e59a64a401 100644 --- a/runtime/lib/ttnn/operations/layout/typecast.cpp +++ b/runtime/lib/ttnn/operations/layout/typecast.cpp @@ -17,7 +17,7 @@ void run(const ::tt::target::ttnn::TypecastOp *op, ProgramContext &context) { ::tt::runtime::ttnn::utils::toTTNNDataType(op->dtype()); ::ttnn::Tensor out = ::ttnn::typecast(inputTensor, targetDataType); - utils::updateTensorPool(tensorPool, out, op->out()->global_id()); + tensorPool.insert_or_assign(op->out()->global_id(), out); } } // namespace tt::runtime::ttnn::operations::layout diff --git a/runtime/lib/ttnn/operations/matmul/matmul.cpp b/runtime/lib/ttnn/operations/matmul/matmul.cpp index 
abe71f9707..dc72ce7b9e 100644 --- a/runtime/lib/ttnn/operations/matmul/matmul.cpp +++ b/runtime/lib/ttnn/operations/matmul/matmul.cpp @@ -6,6 +6,7 @@ #include "tt/runtime/detail/logger.h" #include "tt/runtime/detail/ttnn.h" #include "tt/runtime/ttnn/operations/utils.h" +#include "tt/runtime/ttnn/utils.h" #include // ANCHOR: adding_an_op_matmul_runtime_operations @@ -18,7 +19,7 @@ void run(const ::tt::target::ttnn::MatmulOp *op, ProgramContext &context) { DEBUG_ASSERT(rhs.is_allocated()); ::ttnn::DataType outputDataType = utils::getDataType(op->out()); ::tt::tt_metal::MemoryConfig outputMemoryConfig = - utils::createMemoryConfig(op->out()); + ::tt::runtime::ttnn::utils::createMemoryConfig(op->out()); std::optional< ::ttnn::operations::matmul::MatmulMultiCoreReuseMultiCast1DProgramConfig> diff --git a/runtime/lib/ttnn/operations/normalization/softmax.cpp b/runtime/lib/ttnn/operations/normalization/softmax.cpp index a83358567c..432f920956 100644 --- a/runtime/lib/ttnn/operations/normalization/softmax.cpp +++ b/runtime/lib/ttnn/operations/normalization/softmax.cpp @@ -6,6 +6,7 @@ #include "tt/runtime/detail/logger.h" #include "tt/runtime/detail/ttnn.h" #include "tt/runtime/ttnn/operations/utils.h" +#include "tt/runtime/ttnn/utils.h" namespace tt::runtime::ttnn::operations::normalization { void run(const ::tt::target::ttnn::SoftmaxOp *op, ProgramContext &context) { @@ -14,7 +15,7 @@ void run(const ::tt::target::ttnn::SoftmaxOp *op, ProgramContext &context) { DEBUG_ASSERT(in.is_allocated()); int32_t dimension = op->dimension(); ::tt::tt_metal::MemoryConfig outputMemoryConfig = - utils::createMemoryConfig(op->out()); + ::tt::runtime::ttnn::utils::createMemoryConfig(op->out()); ::ttnn::Tensor out = ::ttnn::softmax(in, dimension, outputMemoryConfig); tensorPool.insert_or_assign(op->out()->global_id(), out); } diff --git a/runtime/lib/ttnn/operations/pool/maxpool2d.cpp b/runtime/lib/ttnn/operations/pool/maxpool2d.cpp index dfd8b9375e..23a935334f 100644 --- a/runtime/lib/ttnn/operations/pool/maxpool2d.cpp +++ b/runtime/lib/ttnn/operations/pool/maxpool2d.cpp @@ -58,7 +58,8 @@ void run(const ::tt::target::ttnn::MaxPool2dOp *op, ProgramContext &context) { }, targetDevice); } - ::ttnn::MemoryConfig outMemConfig = utils::createMemoryConfig(op->out()); + ::ttnn::MemoryConfig outMemConfig = + ::tt::runtime::ttnn::utils::createMemoryConfig(op->out()); ::ttnn::Tensor out = operation.invoke( 0, input, op->batch_size(), op->input_height(), op->input_width(), op->channels(), {op->kernel_height(), op->kernel_width()}, diff --git a/runtime/lib/ttnn/operations/reduction/reduction.cpp b/runtime/lib/ttnn/operations/reduction/reduction.cpp index 3af46efc9c..a74373ee9f 100644 --- a/runtime/lib/ttnn/operations/reduction/reduction.cpp +++ b/runtime/lib/ttnn/operations/reduction/reduction.cpp @@ -6,6 +6,7 @@ #include "tt/runtime/detail/logger.h" #include "tt/runtime/detail/ttnn.h" #include "tt/runtime/ttnn/operations/utils.h" +#include "tt/runtime/ttnn/utils.h" namespace tt::runtime::ttnn::operations::reduction { static void runReductionOp( @@ -17,7 +18,7 @@ static void runReductionOp( const std::optional<::ttnn::DeviceComputeKernelConfig> &, float)> &ttnnOp) { ::tt::tt_metal::MemoryConfig outputMemoryConfig = - utils::createMemoryConfig(op->out()); + ::tt::runtime::ttnn::utils::createMemoryConfig(op->out()); const ::ttnn::Tensor &in = tensorPool.at(op->in()->global_id()); DEBUG_ASSERT(in.is_allocated()); diff --git a/runtime/lib/ttnn/program.cpp b/runtime/lib/ttnn/program.cpp index 8cfa013891..b22bb26d2a 100644 --- 
a/runtime/lib/ttnn/program.cpp +++ b/runtime/lib/ttnn/program.cpp @@ -29,6 +29,7 @@ #include "tt/runtime/detail/debug.h" #include "tt/runtime/detail/logger.h" #include "tt/runtime/ttnn/types.h" +#include "tt/runtime/ttnn/utils.h" #include "tt/runtime/utils.h" #include "ttmlir/Target/TTNN/program_generated.h" @@ -46,28 +47,19 @@ static ::tt::target::ttnn::TTNNBinary const *getBinary(Flatbuffer binary) { class ProgramExecutor { public: - ProgramExecutor(Binary &executableHandle, const TensorMap &liveTensors, - const std::unordered_set &programInputs, - const std::unordered_set &programOutputs, - ::ttnn::MeshDevice *meshDevice) + ProgramExecutor( + const Binary &executableHandle, + const std::unordered_map &liveTensors, + const std::vector &programInputs, + const std::vector &programOutputs, + ::ttnn::MeshDevice *meshDevice) : executableHandle(executableHandle), context(ProgramContext(liveTensors, programInputs, programOutputs, meshDevice)) {} void runCallback(Binary &executableHandle, const ::tt::target::ttnn::Operation *opContext, - ProgramContext *programContext) { - if (auto callback = debug::Hooks::get().getOperatorCallback(); callback) { - std::shared_ptr programContextPtr = - ::tt::runtime::utils::unsafe_borrow_shared(programContext); - std::shared_ptr opContextPtr = - ::tt::runtime::utils::unsafe_borrow_shared( - const_cast<::tt::target::ttnn::Operation *>(opContext)); - (*callback)(executableHandle, - CallbackContext(programContextPtr, DeviceRuntime::TTNN), - OpContext(opContextPtr, DeviceRuntime::TTNN)); - } - } + ProgramContext *programContext); void execute(const ::tt::target::ttnn::Program *program) { for (const ::tt::target::ttnn::Operation *op : *program->operations()) { @@ -79,6 +71,9 @@ class ProgramExecutor { } ProgramContext &getContext() { return context; } + std::vector gatherOutputTensors() { + return context.getTensorPool().gatherOutputTensors(); + } private: Binary executableHandle; @@ -87,6 +82,21 @@ class ProgramExecutor { void runEltwiseOperation(const ::tt::target::ttnn::EltwiseOp *op); }; +void ProgramExecutor::runCallback( + Binary &executableHandle, const ::tt::target::ttnn::Operation *opContext, + ProgramContext *programContext) { + if (auto callback = debug::Hooks::get().getOperatorCallback(); callback) { + std::shared_ptr programContextPtr = + ::tt::runtime::utils::unsafe_borrow_shared(programContext); + std::shared_ptr opContextPtr = + ::tt::runtime::utils::unsafe_borrow_shared( + const_cast<::tt::target::ttnn::Operation *>(opContext)); + (*callback)(executableHandle, + CallbackContext(programContextPtr, DeviceRuntime::TTNN), + OpContext(opContextPtr, DeviceRuntime::TTNN)); + } +} + void ProgramExecutor::runEltwiseOperation( const ::tt::target::ttnn::EltwiseOp *op) { auto runUnaryOp = [&]() { @@ -193,6 +203,26 @@ void ProgramExecutor::runOperation(const ::tt::target::ttnn::Operation *op) { } // Nop is single input, output tensor where input is returned as output. 
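For orientation on the callback hook factored out of the class above: runCallback invokes a frontend-registered function with the (Binary, CallbackContext, OpContext) triple. A minimal sketch of what such a callback could look like, using only the public runtime API that appears in this change (the registration path through debug::Hooks is not shown here, and the function name is hypothetical):

// Hedged sketch, not part of this change: a frontend-side operator callback
// matching the invocation in ProgramExecutor::runCallback. It logs each op's
// debug string through the public getOpDebugString helper.
#include "tt/runtime/runtime.h"
#include <iostream>

static void logEveryOp(tt::runtime::Binary binary,
                       tt::runtime::CallbackContext programContext,
                       tt::runtime::OpContext opContext) {
  std::cout << tt::runtime::getOpDebugString(opContext) << std::endl;
}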
+static bool isNopProgram(const ::tt::target::ttnn::Program *program) { + return program->inputs()->size() == 1 && program->outputs()->size() == 1 && + program->inputs()->Get(0)->global_id() == + program->outputs()->Get(0)->global_id(); +} + +static ::ttnn::Tensor +handleNopProgram(::tt::target::ttnn::Program const *program, + std::vector<::ttnn::Tensor *> const &inputs) { + const ::ttnn::Tensor &input = *inputs[0]; + ::ttnn::Tensor output = + ::ttnn::zeros(input.get_shape(), input.get_dtype(), input.get_layout()); + const void *src = ::tt::tt_metal::get_raw_host_data_ptr(input); + void *dst = ::tt::tt_metal::get_raw_host_data_ptr(output); + std::memcpy(dst, src, input.volume() * input.element_size()); + return output; +} + +namespace legacy { + static bool handleNopProgram(::tt::target::ttnn::Program const *program, std::vector<::ttnn::Tensor *> const &inputs, std::vector<::ttnn::Tensor *> const &outputs) { @@ -221,8 +251,8 @@ void runProgram(::ttnn::MeshDevice &meshDevice, Binary &executableHandle, if (handleNopProgram(program, inputs, outputs)) { return; } - TensorMap liveTensors; - std::unordered_set programInputs; + std::unordered_map liveTensors; + std::vector programInputs; int inputIndex = 0; LOG_ASSERT(program->inputs()->size() == inputs.size(), "Program input size mismatch: ", program->inputs()->size(), @@ -231,21 +261,68 @@ void runProgram(::ttnn::MeshDevice &meshDevice, Binary &executableHandle, auto [iter, inserted] = liveTensors.try_emplace(input->global_id(), inputs[inputIndex++]); LOG_ASSERT(inserted, "Duplicate input tensor"); - programInputs.emplace(input->global_id()); + programInputs.push_back(input->global_id()); } int outputIndex = 0; - std::unordered_set programOutputs; + std::vector programOutputs; LOG_ASSERT(program->outputs()->size() == outputs.size()); for (::tt::target::TensorRef const *output : *program->outputs()) { auto [iter, inserted] = liveTensors.try_emplace(output->global_id(), outputs[outputIndex++]); LOG_ASSERT(inserted, "Duplicate output tensor"); - programOutputs.emplace(output->global_id()); + programOutputs.push_back(output->global_id()); + } + ProgramExecutor executor(executableHandle, liveTensors, programInputs, + programOutputs, &meshDevice); + executor.execute(program); + outputIndex = 0; + for (uint32_t outputId : programOutputs) { + const ::ttnn::Tensor &src = + executor.getContext().getTensorPool().at(outputId); + const ::ttnn::Tensor &dst = *outputs[outputIndex++]; + size_t srcSize = src.volume() * src.element_size(); + size_t dstSize = dst.volume() * dst.element_size(); + LOG_ASSERT(srcSize == dstSize, "Output tensor size mismatch"); + void *srcPtr = ::tt::tt_metal::get_raw_host_data_ptr(src); + void *dstPtr = ::tt::tt_metal::get_raw_host_data_ptr(dst); + std::memcpy(dstPtr, srcPtr, dstSize); + } +} +} // namespace legacy + +std::vector runProgram(::ttnn::MeshDevice &meshDevice, + Binary executableHandle, + std::uint32_t programIndex, + std::vector<::ttnn::Tensor *> const &inputs) { + ::tt::target::ttnn::TTNNBinary const &fbb = *getBinary(executableHandle); + ::tt::target::ttnn::Program const *program = + fbb.programs()->Get(programIndex); + if (isNopProgram(program)) { + Tensor out = + utils::createRuntimeTensorFromTTNN(handleNopProgram(program, inputs)); + return {out}; + } + std::unordered_map liveTensors; + std::vector programInputs; + int inputIndex = 0; + LOG_ASSERT(program->inputs()->size() == inputs.size(), + "Program input size mismatch: ", program->inputs()->size(), + " != ", inputs.size()); + for (::tt::target::TensorRef const *input : 
*program->inputs()) { + auto [iter, inserted] = + liveTensors.try_emplace(input->global_id(), inputs[inputIndex++]); + LOG_ASSERT(inserted, "Duplicate input tensor"); + programInputs.push_back(input->global_id()); + } + std::vector programOutputs; + for (::tt::target::TensorRef const *output : *program->outputs()) { + programOutputs.push_back(output->global_id()); } ProgramExecutor executor(executableHandle, liveTensors, programInputs, programOutputs, &meshDevice); executor.execute(program); + std::vector outputTensors = executor.gatherOutputTensors(); + return outputTensors; } } // namespace tt::runtime::ttnn diff --git a/runtime/lib/ttnn/runtime.cpp b/runtime/lib/ttnn/runtime.cpp index 86fd2d25c6..44d49638e3 100644 --- a/runtime/lib/ttnn/runtime.cpp +++ b/runtime/lib/ttnn/runtime.cpp @@ -21,6 +21,267 @@ using ::tt::tt_metal::DistributedTensorConfig; using ::tt::tt_metal::OwnedStorage; using ::tt::tt_metal::raise_unsupported_storage; +namespace detail { +class LayoutConverter { +public: + LayoutDesc inputDesc; + LayoutDesc outputDesc; + bool shouldTilize = false; + bool shouldUntilize = false; + bool shouldTypecast = false; + bool shouldToDevice = false; + bool shouldToMemoryConfig = false; + LayoutConverter(const LayoutDesc &inputDesc, const LayoutDesc &outputDesc) + : inputDesc(inputDesc), outputDesc(outputDesc) { + shouldTilize = (inputDesc.layout == ::ttnn::Layout::ROW_MAJOR and + outputDesc.layout == ::ttnn::Layout::TILE); + shouldUntilize = (inputDesc.layout == ::ttnn::Layout::TILE and + outputDesc.layout == ::ttnn::Layout::ROW_MAJOR); + + shouldTypecast = (inputDesc.dataType != outputDesc.dataType); + + shouldToDevice = (inputDesc.isOnHost() and outputDesc.isOnDevice()); + + shouldToMemoryConfig = + (not shouldToDevice and outputDesc.isOnDevice() and + (inputDesc.memoryConfig != outputDesc.memoryConfig)); + } + + ::ttnn::Tensor + convertTensorLayout(const ::ttnn::Tensor &input, + std::optional targetDevice) { + if (inputDesc.isOnHost()) { + return convertHostTensorLayout(input, targetDevice); + } + return convertDeviceTensorLayout(input); + } + +private: + ::ttnn::Tensor toLayoutIfNeeded(const ::ttnn::Tensor &input) { + if (shouldTilize) { + return ::ttnn::to_layout(input, ::ttnn::Layout::TILE, std::nullopt, + std::nullopt, + static_cast<::ttnn::Device *>(nullptr)); + } + if (shouldUntilize) { + return ::ttnn::to_layout(input, ::ttnn::Layout::ROW_MAJOR, std::nullopt, + std::nullopt, + static_cast<::ttnn::Device *>(nullptr)); + } + return input; + } + + ::ttnn::Tensor typecastIfNeeded(const ::ttnn::Tensor &input) { + if (shouldTypecast) { + return ::ttnn::typecast(input, outputDesc.dataType); + } + return input; + } + + ::ttnn::Tensor toDeviceIfNeeded(const ::ttnn::Tensor &input, + std::optional targetDevice, + bool force = false) { + if (shouldToDevice or force) { + LOG_ASSERT(targetDevice.has_value()); + LOG_ASSERT(outputDesc.memoryConfig.has_value()); + return std::visit( + [&](auto &&targetDevice) -> ::ttnn::Tensor { + return ::ttnn::to_device(input, &(targetDevice.get()), + outputDesc.memoryConfig.value()); + }, + targetDevice.value()); + } + return input; + } + + ::ttnn::Tensor toMemoryConfigIfNeeded(const ::ttnn::Tensor &input) { + if (shouldToMemoryConfig) { + LOG_ASSERT(outputDesc.memoryConfig.has_value()); + return ::ttnn::to_memory_config(input, outputDesc.memoryConfig.value()); + } + return input; + } + + // + // Host input tensor APIs + // + ::ttnn::Tensor + handleHostInputNoLayoutNoTypecast(const ::ttnn::Tensor &input, + std::optional targetDevice) { + ::ttnn::Tensor 
out = toDeviceIfNeeded(input, targetDevice); + out = toMemoryConfigIfNeeded(out); + return out; + } + + ::ttnn::Tensor + handleHostInputLayoutNoTypecast(const ::ttnn::Tensor &input, + std::optional targetDevice) { + if (shouldUntilize) { + ::ttnn::Tensor out = toLayoutIfNeeded(input); + out = toDeviceIfNeeded(out, targetDevice); + out = toMemoryConfigIfNeeded(out); + return out; + } + + if (shouldTilize and outputDesc.dataType == ::ttnn::DataType::BFLOAT16) { + ::ttnn::Tensor out = toDeviceIfNeeded(input, targetDevice); + out = toLayoutIfNeeded(out); + out = toMemoryConfigIfNeeded(out); + return out; + } + + if (shouldTilize and outputDesc.dataType != ::ttnn::DataType::BFLOAT16) { + ::ttnn::Tensor out = toLayoutIfNeeded(input); + out = toDeviceIfNeeded(out, targetDevice); + out = toMemoryConfigIfNeeded(out); + return out; + } + LOG_FATAL("Unreachable code path"); + } + + ::ttnn::Tensor + handleHostInputNoLayoutTypecast(const ::ttnn::Tensor &input, + std::optional targetDevice) { + if (outputDesc.layout == ::ttnn::Layout::TILE) { + ::ttnn::Tensor out = toDeviceIfNeeded(input, targetDevice); + out = typecastIfNeeded(out); + out = toMemoryConfigIfNeeded(out); + return out; + } + + if (outputDesc.layout != ::ttnn::Layout::TILE) { + ::ttnn::Tensor out = typecastIfNeeded(input); + out = toDeviceIfNeeded(out, targetDevice); + out = toMemoryConfigIfNeeded(out); + return out; + } + LOG_FATAL("Unreachable code path"); + } + + ::ttnn::Tensor + handleHostInputLayoutTypecast(const ::ttnn::Tensor &input, + std::optional targetDevice) { + if (shouldUntilize) { + ::ttnn::Tensor out = typecastIfNeeded(input); + out = toLayoutIfNeeded(out); + out = toDeviceIfNeeded(out, targetDevice); + out = toMemoryConfigIfNeeded(out); + return out; + } + + if (shouldTilize and inputDesc.dataType == ::ttnn::DataType::BFLOAT16) { + ::ttnn::Tensor out = toDeviceIfNeeded(input, targetDevice); + out = toLayoutIfNeeded(out); + out = typecastIfNeeded(out); + out = toMemoryConfigIfNeeded(out); + return out; + } + + if (shouldTilize and outputDesc.dataType == ::ttnn::DataType::BFLOAT16) { + ::ttnn::Tensor out = typecastIfNeeded(input); + out = toDeviceIfNeeded(out, targetDevice); + out = toLayoutIfNeeded(out); + out = toMemoryConfigIfNeeded(out); + return out; + } + + if (shouldTilize and inputDesc.dataType != ::ttnn::DataType::BFLOAT16 and + outputDesc.dataType != ::ttnn::DataType::BFLOAT16) { + ::ttnn::Tensor out = typecastIfNeeded(input); + out = toLayoutIfNeeded(out); + out = toDeviceIfNeeded(out, targetDevice); + out = toMemoryConfigIfNeeded(out); + return out; + } + + LOG_FATAL("Unreachable code path"); + } + + ::ttnn::Tensor + convertHostTensorLayout(const ::ttnn::Tensor &input, + std::optional targetDevice) { + bool shouldToLayout = (shouldTilize or shouldUntilize); + LOG_ASSERT(not shouldToDevice or targetDevice.has_value(), + "Target device must be provided for ToDevice"); + if (not shouldToLayout and not shouldTypecast) { + return handleHostInputNoLayoutNoTypecast(input, targetDevice); + } + if (shouldToLayout and not shouldTypecast) { + return handleHostInputLayoutNoTypecast(input, targetDevice); + } + if (not shouldToLayout and shouldTypecast) { + return handleHostInputNoLayoutTypecast(input, targetDevice); + } + if (shouldToLayout and shouldTypecast) { + return handleHostInputLayoutTypecast(input, targetDevice); + } + LOG_FATAL("Unreachable code path"); + } + + // + // Device input tensor APIs + // + ::ttnn::Tensor + handleDeviceInputNoLayoutNoTypecast(const ::ttnn::Tensor &input) { + return 
toMemoryConfigIfNeeded(input); + } + + ::ttnn::Tensor + handleDeviceInputLayoutNoTypecast(const ::ttnn::Tensor &input) { + if (shouldUntilize) { + LOG_WARNING( + "Device to device tilize is not fully supported on the ttnn side"); + } + ::ttnn::Tensor out = toLayoutIfNeeded(input); + out = toMemoryConfigIfNeeded(out); + return out; + } + + ::ttnn::Tensor + handleDeviceInputNoLayoutTypecast(const ::ttnn::Tensor &input) { + ::ttnn::Tensor out = typecastIfNeeded(input); + out = toMemoryConfigIfNeeded(out); + return out; + } + + ::ttnn::Tensor handleDeviceInputLayoutTypecast(const ::ttnn::Tensor &input) { + if (shouldUntilize) { + LOG_WARNING( + "Device to device untilize is not fully supported on the ttnn side"); + ::ttnn::Tensor out = typecastIfNeeded(input); + out = toLayoutIfNeeded(out); + out = toMemoryConfigIfNeeded(out); + return out; + } + if (shouldTilize) { + ::ttnn::Tensor out = toLayoutIfNeeded(input); + out = typecastIfNeeded(out); + out = toMemoryConfigIfNeeded(out); + return out; + } + LOG_FATAL("Unreachable code path"); + } + + ::ttnn::Tensor convertDeviceTensorLayout(const ::ttnn::Tensor &input) { + bool shouldToLayout = (shouldTilize or shouldUntilize); + if (not shouldToLayout and not shouldTypecast) { + return handleDeviceInputNoLayoutNoTypecast(input); + } + if (shouldToLayout and not shouldTypecast) { + return handleDeviceInputLayoutNoTypecast(input); + } + if (not shouldToLayout and shouldTypecast) { + return handleDeviceInputNoLayoutTypecast(input); + } + if (shouldToLayout and shouldTypecast) { + return handleDeviceInputLayoutTypecast(input); + } + LOG_FATAL("Unreachable code path"); + } +}; + +} // namespace detail + template static StorageType createStorage(ElementType *ptr, std::uint32_t numElements) { if constexpr (std::is_same_v) { @@ -76,6 +337,20 @@ static Tensor createNullTensor() { return Tensor(nullptr, nullptr, DeviceRuntime::TTNN); } +static DeviceVariant getTargetDevice(::ttnn::MeshDevice &meshDevice) { + if (meshDevice.num_devices() == 1) { + return std::ref(*(meshDevice.get_device_index(0))); + } + return std::ref(meshDevice); +} + +static ::tt::target::ttnn::TTNNBinary const *getBinary(Flatbuffer binary) { + bool isTTNN = ::tt::target::ttnn::SizePrefixedTTNNBinaryBufferHasIdentifier( + binary.handle.get()); + LOG_ASSERT(isTTNN, "Unsupported binary format"); + return ::tt::target::ttnn::GetSizePrefixedTTNNBinary(binary.handle.get()); +} + Tensor createTensor(std::shared_ptr data, std::vector const &shape, std::vector const &stride, @@ -89,7 +364,7 @@ Tensor createTensor(std::shared_ptr data, createStorage(data.get(), numElements, dataType), ::ttnn::Shape(small_vector_shape), utils::toTTNNDataType(dataType), ::ttnn::Layout::ROW_MAJOR); - return Tensor(std::static_pointer_cast(tensor), data, + return Tensor(std::static_pointer_cast(tensor), nullptr, DeviceRuntime::TTNN); } @@ -100,8 +375,8 @@ createTensor(std::vector> &data, ::tt::target::DataType dataType, std::unordered_map const &strategy) { std::vector<::ttnn::Tensor> tensorShards; - tensorShards.resize(data.size()); - std::transform(data.begin(), data.end(), tensorShards.begin(), + tensorShards.reserve(data.size()); + std::transform(data.begin(), data.end(), std::back_inserter(tensorShards), [&](std::shared_ptr &dataShard) -> ::ttnn::Tensor { return createOwnedTensor(dataShard, shape, stride, itemsize, dataType); @@ -112,10 +387,7 @@ createTensor(std::vector> &data, ::ttnn::distributed::api::create_multi_device_tensor( tensorShards, ::tt::tt_metal::StorageType::MULTI_DEVICE_HOST, 
distributionStrategy)); - std::shared_ptr>> borrowedData = - std::make_shared>>(data); - return Tensor(std::static_pointer_cast(tensor), - std::static_pointer_cast(borrowedData), + return Tensor(std::static_pointer_cast(tensor), nullptr, DeviceRuntime::TTNN); } @@ -166,34 +438,112 @@ void deallocateBuffers(Device deviceHandle) { } } -Event submit(Device deviceHandle, Binary executableHandle, - std::uint32_t programIndex, - std::vector const &inputHandles, - std::vector const &outputHandles) { +Tensor toHost(Tensor tensor, bool untilize) { + const ::ttnn::Tensor &deviceTensor = + tensor.as<::ttnn::Tensor>(DeviceRuntime::TTNN); + std::shared_ptr<::ttnn::Tensor> hostTensor = + std::make_shared<::ttnn::Tensor>(::ttnn::from_device(deviceTensor)); + if (untilize) { + hostTensor = std::make_shared<::ttnn::Tensor>(::ttnn::to_layout( + *hostTensor, ::ttnn::Layout::ROW_MAJOR, std::nullopt, std::nullopt, + static_cast<::ttnn::Device *>(nullptr))); + } + return Tensor(std::static_pointer_cast(hostTensor), nullptr, + DeviceRuntime::TTNN); +} + +Tensor toDevice(Tensor tensor, Device device) { + const ::ttnn::Tensor &hostTensor = + tensor.as<::ttnn::Tensor>(DeviceRuntime::TTNN); + auto &meshDevice = device.as<::ttnn::MeshDevice>(DeviceRuntime::TTNN); + DeviceVariant targetDevice = getTargetDevice(meshDevice); + std::shared_ptr<::ttnn::Tensor> deviceTensor = std::visit( + [&hostTensor](auto &&device) -> std::shared_ptr<::ttnn::Tensor> { + return std::make_shared<::ttnn::Tensor>( + ::ttnn::to_device(hostTensor, &(device.get()), std::nullopt)); + }, + targetDevice); + return Tensor(std::static_pointer_cast(deviceTensor), nullptr, + DeviceRuntime::TTNN); +} + +Tensor toDevice(Tensor tensor, Device device, Layout layout) { + const ::ttnn::Tensor &hostTensor = + tensor.as<::ttnn::Tensor>(DeviceRuntime::TTNN); + LOG_ASSERT(utils::isOnHost(hostTensor.storage_type()), + "Input tensor to ToDevice must be on host"); + const ::ttnn::Layout &inputLayout = hostTensor.get_layout(); + const ::ttnn::DataType &inputDataType = hostTensor.get_dtype(); + LayoutDesc inputLayoutDesc(::ttnn::BufferType::SYSTEM_MEMORY, inputLayout, + inputDataType, std::nullopt); + const LayoutDesc &outputLayoutDesc = + layout.as(DeviceRuntime::TTNN); + LOG_ASSERT(outputLayoutDesc.isOnDevice(), "Output layout must be on device"); + LOG_ASSERT(outputLayoutDesc.memoryConfig.has_value(), + "Output layout must have memory config"); ::ttnn::MeshDevice &meshDevice = - deviceHandle.as<::ttnn::MeshDevice>(DeviceRuntime::TTNN); - std::vector<::ttnn::Tensor *> inputs; - inputs.reserve(inputHandles.size()); - for (auto &input : inputHandles) { - LOG_ASSERT(input.matchesRuntime(DeviceRuntime::TTNN)); - inputs.push_back(static_cast<::ttnn::Tensor *>(input.handle.get())); - } + device.as<::ttnn::MeshDevice>(DeviceRuntime::TTNN); + DeviceVariant targetDevice = getTargetDevice(meshDevice); - std::vector<::ttnn::Tensor *> outputs; - outputs.reserve(outputHandles.size()); - for (auto &output : outputHandles) { - LOG_ASSERT(output.matchesRuntime(DeviceRuntime::TTNN)); - outputs.push_back(static_cast<::ttnn::Tensor *>(output.handle.get())); - } + detail::LayoutConverter converter(inputLayoutDesc, outputLayoutDesc); + std::shared_ptr<::ttnn::Tensor> out = std::make_shared<::ttnn::Tensor>( + converter.convertTensorLayout(hostTensor, targetDevice)); - tt::runtime::ttnn::runProgram(meshDevice, executableHandle, programIndex, - inputs, outputs); - return Event(nullptr, DeviceRuntime::TTNN); + return Tensor(std::static_pointer_cast(out), nullptr, + DeviceRuntime::TTNN); 
} -void wait(Event event) { - // Not implemented - LOG_ASSERT(event.matchesRuntime(DeviceRuntime::TTNN)); +Tensor toLayout(Tensor tensor, Layout layout) { + const ::ttnn::Tensor &inputTensor = + tensor.as<::ttnn::Tensor>(DeviceRuntime::TTNN); + const ::ttnn::StorageType &inputStorageType = inputTensor.storage_type(); + const ::ttnn::BufferType &inputBufferType = + utils::isOnHost(inputStorageType) + ? ::ttnn::BufferType::SYSTEM_MEMORY + : inputTensor.memory_config().buffer_type; + const ::ttnn::Layout &inputLayout = inputTensor.get_layout(); + const ::ttnn::DataType &inputDataType = inputTensor.get_dtype(); + std::optional<::ttnn::MemoryConfig> inputMemoryConfig = std::nullopt; + if (utils::isOnDevice(inputStorageType)) { + inputMemoryConfig = + std::make_optional<::ttnn::MemoryConfig>(inputTensor.memory_config()); + } + LayoutDesc inputLayoutDesc(inputBufferType, inputLayout, inputDataType, + inputMemoryConfig); + const LayoutDesc &outputLayoutDesc = + layout.as(DeviceRuntime::TTNN); + LOG_ASSERT(inputLayoutDesc.isOnHost() == outputLayoutDesc.isOnHost(), + "Input and output layout must be on the same memory space"); + + detail::LayoutConverter converter(inputLayoutDesc, outputLayoutDesc); + std::shared_ptr<::ttnn::Tensor> out = std::make_shared<::ttnn::Tensor>( + converter.convertTensorLayout(inputTensor, std::nullopt)); + + return Tensor(std::static_pointer_cast(out), nullptr, + DeviceRuntime::TTNN); +} + +Layout getLayout(Binary executableHandle, std::uint32_t programIndex, + std::uint32_t inputIndex) { + const ::tt::target::ttnn::TTNNBinary &fbb = *getBinary(executableHandle); + LOG_ASSERT(programIndex < fbb.programs()->size(), "Invalid program index"); + const ::tt::target::ttnn::Program *program = + fbb.programs()->Get(programIndex); + LOG_ASSERT(inputIndex < program->inputs()->size(), "Invalid input index"); + const ::tt::target::TensorRef *input = program->inputs()->Get(inputIndex); + ::ttnn::BufferType inputBufferType = utils::toTTNNBufferType( + input->desc()->layout()->memory_desc()->memory_space()); + ::ttnn::Layout inputLayout = utils::inferLayoutFromTileShape(input); + ::ttnn::DataType inputDataType = utils::toTTNNDataType( + input->desc()->layout()->memory_desc()->data_type()); + std::optional<::ttnn::MemoryConfig> inputMemoryConfig = std::nullopt; + if (inputBufferType != ::ttnn::BufferType::SYSTEM_MEMORY) { + inputMemoryConfig = utils::createMemoryConfig(input); + } + std::shared_ptr layoutDesc = std::make_shared( + inputBufferType, inputLayout, inputDataType, inputMemoryConfig); + return Layout(std::static_pointer_cast(layoutDesc), + DeviceRuntime::TTNN); } std::string getOpDebugString(OpContext opContextHandle) { @@ -299,7 +649,7 @@ Tensor getOpOutputTensor(OpContext opContextHandle, return createNullTensor(); } default: { - throw std::runtime_error("Unsupported operation type"); + LOG_FATAL("Unsupported operation type"); } } @@ -326,12 +676,13 @@ Tensor getOpOutputTensor(OpContext opContextHandle, outCopy.shape().value, ::ttnn::DataType::FLOAT32, ::ttnn::Layout::ROW_MAJOR); - return Tensor(std::static_pointer_cast(tensor), data, + return Tensor(std::static_pointer_cast(tensor), nullptr, DeviceRuntime::TTNN); } std::vector getTensorData(Tensor tensor) { - ::ttnn::Tensor *nnTensor = static_cast<::ttnn::Tensor *>(tensor.handle.get()); + const ::ttnn::Tensor *nnTensor = + static_cast<::ttnn::Tensor *>(tensor.handle.get()); if (nnTensor == nullptr) { return {}; } @@ -341,4 +692,79 @@ std::vector getTensorData(Tensor tensor) { static_cast(dataPtr) + nnTensor->volume()); } 
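The memcpy and deallocateTensor definitions below complete the new tensor API. End to end, the intended host-side flow is roughly the following; a minimal sketch under stated assumptions: binary is a loaded TTNN flatbuffer, input is a host tensor already built with createTensor, and hostBuffer is a caller-owned allocation of matching size (all three names are hypothetical).

// Hedged sketch of the non-legacy submit flow; not a drop-in program.
#include "tt/runtime/runtime.h"
#include <vector>

void runOnce(tt::runtime::Binary binary, tt::runtime::Tensor input,
             void *hostBuffer) {
  tt::runtime::Device device = tt::runtime::openDevice({0});
  // Query the layout program 0 expects for its first input.
  tt::runtime::Layout layout = tt::runtime::getLayout(binary, 0, 0);
  // Move the host tensor to device, tilizing/typecasting as needed.
  tt::runtime::Tensor onDevice = tt::runtime::toDevice(input, device, layout);
  // submit now returns output tensors instead of filling caller buffers.
  std::vector<tt::runtime::Tensor> outputs =
      tt::runtime::submit(device, binary, 0, {onDevice});
  // Bring the first output back to host, untilized to row-major.
  tt::runtime::Tensor host = tt::runtime::toHost(outputs[0], true);
  // Copy the bytes out, then release the runtime-owned tensor.
  tt::runtime::memcpy(hostBuffer, host);
  tt::runtime::deallocateTensor(host, true);
  tt::runtime::closeDevice(device);
}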
+void memcpy(void *dst, Tensor src) { + const ::ttnn::Tensor &srcTensor = src.as<::ttnn::Tensor>(DeviceRuntime::TTNN); + void *srcPtr = ::tt::tt_metal::get_raw_host_data_ptr(srcTensor); + size_t size = srcTensor.volume() * srcTensor.element_size(); + std::memcpy(dst, srcPtr, size); +} + +void memcpy(Tensor dst, Tensor src) { + ::ttnn::Tensor &dstTensor = dst.as<::ttnn::Tensor>(DeviceRuntime::TTNN); + const ::ttnn::Tensor &srcTensor = src.as<::ttnn::Tensor>(DeviceRuntime::TTNN); + LOG_ASSERT(srcTensor.volume() * srcTensor.element_size() == + dstTensor.volume() * dstTensor.element_size(), + "Input output tensor size mismatch in memcpy: ", + srcTensor.volume(), " * ", srcTensor.element_size(), + " != ", dstTensor.volume(), " * ", dstTensor.element_size()); + void *dstPtr = ::tt::tt_metal::get_raw_host_data_ptr(dstTensor); + ::tt::runtime::ttnn::memcpy(dstPtr, src); +} + +void deallocateTensor(Tensor &tensor, bool force) { + ::ttnn::Tensor &ttnnTensor = tensor.as<::ttnn::Tensor>(DeviceRuntime::TTNN); + ::ttnn::deallocate(ttnnTensor, force); +} + +namespace legacy { +void wait(Event event) { + // Not implemented + LOG_ASSERT(event.matchesRuntime(DeviceRuntime::TTNN)); +} + +Event submit(Device deviceHandle, Binary executableHandle, + std::uint32_t programIndex, + std::vector const &inputHandles, + std::vector const &outputHandles) { + ::ttnn::MeshDevice &meshDevice = + deviceHandle.as<::ttnn::MeshDevice>(DeviceRuntime::TTNN); + std::vector<::ttnn::Tensor *> inputs; + inputs.reserve(inputHandles.size()); + for (auto &input : inputHandles) { + LOG_ASSERT(input.matchesRuntime(DeviceRuntime::TTNN)); + inputs.push_back(static_cast<::ttnn::Tensor *>(input.handle.get())); + } + + std::vector<::ttnn::Tensor *> outputs; + outputs.reserve(outputHandles.size()); + for (auto &output : outputHandles) { + LOG_ASSERT(output.matchesRuntime(DeviceRuntime::TTNN)); + outputs.push_back(static_cast<::ttnn::Tensor *>(output.handle.get())); + } + + tt::runtime::ttnn::legacy::runProgram(meshDevice, executableHandle, + programIndex, inputs, outputs); + return Event(nullptr, DeviceRuntime::TTNN); +} +} // namespace legacy + +std::vector submit(Device deviceHandle, Binary executableHandle, + std::uint32_t programIndex, + std::vector const &inputHandles) { + ::ttnn::MeshDevice &meshDevice = + deviceHandle.as<::ttnn::MeshDevice>(DeviceRuntime::TTNN); + std::vector<::ttnn::Tensor *> inputs; + inputs.reserve(inputHandles.size()); + std::transform(inputHandles.begin(), inputHandles.end(), + std::back_inserter(inputs), + [](const Tensor &input) -> ::ttnn::Tensor * { + LOG_ASSERT(input.matchesRuntime(DeviceRuntime::TTNN)); + return static_cast<::ttnn::Tensor *>(input.handle.get()); + }); + std::vector outputs = ::tt::runtime::ttnn::runProgram( + meshDevice, executableHandle, programIndex, inputs); + + return outputs; +} + } // namespace tt::runtime::ttnn diff --git a/runtime/test/ttnn/test_subtract.cpp b/runtime/test/ttnn/test_subtract.cpp index 00aebe20fb..995b95665c 100644 --- a/runtime/test/ttnn/test_subtract.cpp +++ b/runtime/test/ttnn/test_subtract.cpp @@ -21,12 +21,13 @@ TEST(TTNNSubtract, Equal) { const char *fbPath = std::getenv("TTMLIR_SUBTRACT_FB_PATH"); assert(fbPath && "Path to subtract flatbuffer must be provided"); ::tt::runtime::Binary fbb = ::tt::runtime::Binary::loadFromPath(fbPath); - EXPECT_EQ(fbb.getFileIdentifier(), "TTNN"); + ASSERT_EQ(fbb.getFileIdentifier(), "TTNN"); ::tt::runtime::setCompatibleRuntime(fbb); std::vector<::tt::runtime::TensorDesc> inputDescs = fbb.getProgramInputs(0); + 
assert(inputDescs.size() == 2); std::vector<::tt::runtime::TensorDesc> outputDescs = fbb.getProgramOutputs(0); - std::vector<::tt::runtime::Tensor> inputTensors, outputTensors; - + assert(outputDescs.size() == 1); + std::vector<::tt::runtime::Tensor> inputTensors; std::uint32_t tensorSize = inputDescs[0].itemsize; for (const int dim : inputDescs[0].shape) { tensorSize *= dim; @@ -38,26 +39,27 @@ TEST(TTNNSubtract, Equal) { std::memset(data.get(), 1, tensorSize); inputTensors.emplace_back(::tt::runtime::createTensor(data, desc)); } - for (const auto &desc : outputDescs) { - std::shared_ptr data = - ::tt::runtime::utils::malloc_shared(tensorSize); - // Set to wrong value on purpose here - std::memset(data.get(), 1, tensorSize); - outputTensors.emplace_back(::tt::runtime::createTensor(data, desc)); - } + + std::shared_ptr outputDataPtr = + ::tt::runtime::utils::malloc_shared(tensorSize); + // Set to wrong value on purpose here + std::memset(outputDataPtr.get(), 1, tensorSize); + ::tt::runtime::Tensor outputTensor = + ::tt::runtime::createTensor(outputDataPtr, outputDescs[0]); size_t numDevices = ::tt::runtime::getNumAvailableDevices(); std::vector deviceIds(numDevices); std::iota(deviceIds.begin(), deviceIds.end(), 0); - auto device = ::tt::runtime::openDevice(deviceIds); - auto ev = ::tt::runtime::submit(device, fbb, 0, inputTensors, outputTensors); + auto device = ::tt::runtime::openDevice({deviceIds[0]}); + std::vector<::tt::runtime::Tensor> output = + ::tt::runtime::submit(device, fbb, 0, inputTensors); ::tt::runtime::closeDevice(device); - + assert(output.size() == 1); std::shared_ptr expected = ::tt::runtime::utils::malloc_shared(tensorSize); std::memset(expected.get(), 0, tensorSize); - for (const auto &outputTensor : outputTensors) { - EXPECT_EQ(std::memcmp(outputTensor.data.get(), expected.get(), tensorSize), - 0); - } + ::tt::runtime::Tensor submitOutput = output[0]; + ASSERT_NE(std::memcmp(outputDataPtr.get(), expected.get(), tensorSize), 0); + ::tt::runtime::memcpy(outputTensor, submitOutput); + ASSERT_EQ(std::memcmp(outputDataPtr.get(), expected.get(), tensorSize), 0); } diff --git a/runtime/tools/python/setup.py b/runtime/tools/python/setup.py index ddbe3da9fe..50d6b18392 100644 --- a/runtime/tools/python/setup.py +++ b/runtime/tools/python/setup.py @@ -59,7 +59,12 @@ linklibs = ["TTBinary"] if enable_ttnn: runlibs += ["_ttnn.so"] - linklibs += ["TTRuntimeTTNN", "TTRuntimeTTNNOps", ":_ttnn.so"] + linklibs += [ + "TTRuntimeTTNN", + "TTRuntimeTTNNOps", + "TTRuntimeTTNNHelpers", + ":_ttnn.so", + ] if enable_ttmetal: runlibs += ["libtt_metal.so"] @@ -231,6 +236,7 @@ def package_files(directory): f"{src_dir}/build/runtime/lib/common", f"{src_dir}/build/runtime/lib/ttnn", f"{src_dir}/build/runtime/lib/ttnn/operations", + f"{src_dir}/build/runtime/lib/ttnn/include", f"{src_dir}/build/runtime/lib/ttmetal", f"{toolchain}/lib", f"{src_dir}/build/runtime/tools/python/ttrt/runtime", diff --git a/runtime/tools/python/test/test_api.py b/runtime/tools/python/test/test_api.py new file mode 100644 index 0000000000..a0ddb087b8 --- /dev/null +++ b/runtime/tools/python/test/test_api.py @@ -0,0 +1,150 @@ +# SPDX-FileCopyrightText: (c) 2024 Tenstorrent AI ULC +# +# SPDX-License-Identifier: Apache-2.0 + +import os +import pytest +import ttrt +import torch +from ttrt.common.run import Run +from ttrt.common.query import Query +from ttrt.common.util import * +from ttrt.common.api import API +from util import * + + +def log_debug(logger, message, prefix="------", suffix=""): + 
logger.debug(f"{prefix}{message}{suffix}") + + +class TestRunner: + def __init__(self, logger=None): + self.artifacts_dir = f"{os.getcwd()}/ttrt-artifacts" + self.logger = logger if logger is not None else Logger() + self.file_manager = FileManager(self.logger) + self.results = Results(self.logger, self.file_manager) + self.artifacts = Artifacts( + self.logger, self.file_manager, artifacts_folder_path=self.artifacts_dir + ) + self.query = Query({"--quiet": True}, self.logger, self.artifacts) + self.query() + self.results_path = None + self.results = None + + def initialize(self, binary_path, results_path): + self.binary_path = binary_path + self.binary = Binary(self.logger, self.file_manager, binary_path) + self.results_path = results_path + self.results = Results(self.logger, self.file_manager) + + def teardown(self): + self.binary_path = None + self.binary = None + self.results_path = None + self.results = None + + def create_and_save_result( + self, status: str, exception: str = "", program_index: int = 0 + ): + test_result = { + "file_path": self.binary_path, + "result": status, + "exception": exception, + "log_file": self.logger.file_name, + "artifacts": self.artifacts.artifacts_folder_path, + "program_index": program_index, + } + self.results.add_result(test_result) + self.results.save_results(self.results_path) + + def check_constraints(self): + self.binary.check_version() + self.binary.check_system_desc(self.query) + + def test_to_device_to_host(self, shape, dtype): + tensor = torch.randn(shape, dtype=dtype) + result_tensor = torch.zeros(shape, dtype=dtype) + runtime_input_tensor = ttrt.runtime.create_tensor( + tensor.data_ptr(), + list(tensor.shape), + list(tensor.stride()), + tensor.element_size(), + Binary.Program.to_data_type(tensor.dtype), + ) + device = ttrt.runtime.open_device([self.query.device_ids[0]]) + device_tensor = ttrt.runtime.to_device(runtime_input_tensor, device) + host_tensor = ttrt.runtime.to_host(device_tensor) + ttrt.runtime.close_device(device) + ttrt.runtime.memcpy(result_tensor.data_ptr(), host_tensor) + assert torch.allclose(tensor, result_tensor) + + def test_host_layout_conversion(self): + program_indices = list(range(self.binary.get_num_programs())) + for program_index in program_indices: + program = self.binary.get_program(program_index) + program.populate_inputs(Run.TorchInitializer.get_initilizer("randn")) + for input_index, input_tensor in enumerate(program.input_tensors): + layout = ttrt.runtime.get_layout( + self.binary.fbb, program_index, input_index + ) + input_tensor_runtime = ttrt.runtime.create_tensor( + input_tensor.data_ptr(), + list(input_tensor.shape), + list(input_tensor.stride()), + input_tensor.element_size(), + Binary.Program.to_data_type(input_tensor.dtype), + ) + converted_tensor = ttrt.runtime.to_layout(input_tensor_runtime, layout) + result_tensor = torch.zeros( + input_tensor.shape, dtype=input_tensor.dtype + ) + ttrt.runtime.memcpy(result_tensor.data_ptr(), converted_tensor) + assert torch.allclose(input_tensor, result_tensor) + + # TODO: Add test for to_device with layout and on-device layout conversion + # once compiler supports non-host input tensor layouts + + +class TestContext: + def __init__(self, test_runner: TestRunner): + self.test_runner = test_runner + + def __enter__(self): + return self.test_runner + + def __exit__(self, exc_type, exc_val, exc_tb): + try: + if exc_type is None: + self.test_runner.create_and_save_result("pass") + else: + self.test_runner.create_and_save_result("error", str(exc_val)) + finally: + 
self.test_runner.teardown() + return False + + +@pytest.fixture(scope="module") +def test_runner(): + API.initialize_apis() + runner = TestRunner() + yield runner + + +@pytest.mark.parametrize("shape", [(64, 128)]) +@pytest.mark.parametrize("dtype", [torch.float32, torch.bfloat16]) +def test_to_device_to_host(test_runner: TestRunner, shape, dtype, request): + results_path = f"ttrt-results/{request.node.name}.json" + binary_path = BINARY_FILE_PATH + test_runner.initialize(binary_path, results_path) + with TestContext(test_runner): + test_runner.check_constraints() + test_runner.test_to_device_to_host(shape, dtype) + + +def test_host_layout_conversion(test_runner: TestRunner, request): + results_path = f"ttrt-results/{request.node.name}.json" + binary_path = BINARY_FILE_PATH + test_runner.initialize(binary_path, results_path) + with TestContext(test_runner): + test_runner.check_constraints() + test_runner.test_host_layout_conversion() diff --git a/runtime/tools/python/ttrt/common/run.py b/runtime/tools/python/ttrt/common/run.py index c2ae10ac9a..ff77de75ed 100644 --- a/runtime/tools/python/ttrt/common/run.py +++ b/runtime/tools/python/ttrt/common/run.py @@ -380,6 +380,7 @@ def _execute(binaries): self.logging.debug(f"setting torch manual seed={self['--seed']}") torch.manual_seed(self["--seed"]) ttrt.runtime.set_compatible_runtime(binaries[0].fbb) + current_runtime = ttrt.runtime.get_current_runtime() self.logging.debug(f"opening devices={self.query.device_ids}") device = ttrt.runtime.open_device(self.query.device_ids) @@ -459,20 +460,43 @@ def _execute(binaries): self.logging.debug( f"starting loop={loop+1}/{self['--loops']} for binary={bin.file_path}" ) + event = None + if ( + current_runtime + == ttrt.runtime.DeviceRuntime.TTMetal + ): + event = ttrt.runtime.submit( + device, + bin.fbb, + program_index, + total_inputs[loop], + total_outputs[loop], + ) - event = ttrt.runtime.submit( - device, - bin.fbb, - program_index, - total_inputs[loop], - total_outputs[loop], - ) + elif current_runtime == ttrt.runtime.DeviceRuntime.TTNN: + runtime_outputs = ttrt.runtime.submit( + device, + bin.fbb, + program_index, + total_inputs[loop], + ) + for i, runtime_output_tensor in enumerate( + runtime_outputs + ): + ttrt.runtime.memcpy( + total_outputs[loop][i], + runtime_output_tensor, + ) + ttrt.runtime.deallocate_tensor( + runtime_output_tensor, force=True + ) self.logging.debug( f"finished loop={loop+1}/{self['--loops']} for binary={bin.file_path}" ) - ttrt.runtime.wait(event) + if event is not None: + ttrt.runtime.wait(event) if self["--identity"]: self.logging.debug( diff --git a/runtime/tools/python/ttrt/runtime/__init__.py b/runtime/tools/python/ttrt/runtime/__init__.py index 642b0401f5..dc8ac5c35c 100644 --- a/runtime/tools/python/ttrt/runtime/__init__.py +++ b/runtime/tools/python/ttrt/runtime/__init__.py @@ -20,8 +20,14 @@ create_tensor, create_multi_device_tensor, wait, + to_host, + to_device, + to_layout, + get_layout, get_op_output_tensor, get_op_debug_str, + memcpy, + deallocate_tensor, WorkaroundEnv, ) except ModuleNotFoundError: diff --git a/runtime/tools/python/ttrt/runtime/module.cpp b/runtime/tools/python/ttrt/runtime/module.cpp index dfc4a68201..e64c752ba5 100644 --- a/runtime/tools/python/ttrt/runtime/module.cpp +++ b/runtime/tools/python/ttrt/runtime/module.cpp @@ -22,6 +22,7 @@ PYBIND11_MODULE(_C, m) { .def("deallocate_buffers", &tt::runtime::detail::deallocateBuffers); py::class_(m, "Event"); py::class_(m, "Tensor"); + py::class_(m, "Layout"); py::class_(m, "OpContext"); py::class_(m, 
"CallbackContext"); py::enum_<::tt::target::DataType>(m, "DataType") @@ -69,8 +70,8 @@ PYBIND11_MODULE(_C, m) { ::tt::target::DataType dataType, std::unordered_map const &strategy) { std::vector> data; - data.resize(ptrs.size()); - std::transform(ptrs.begin(), ptrs.end(), data.begin(), + data.reserve(ptrs.size()); + std::transform(ptrs.begin(), ptrs.end(), std::back_inserter(data), [](std::uintptr_t ptr) { return ::tt::runtime::utils::unsafe_borrow_shared( reinterpret_cast(ptr)); @@ -85,9 +86,53 @@ PYBIND11_MODULE(_C, m) { py::arg("num_hw_cqs") = size_t{1}, "Open a mesh of devices for execution"); m.def("close_device", &tt::runtime::closeDevice, "Close a mesh device"); - m.def("submit", &tt::runtime::submit, py::arg("device"), - py::arg("executable"), py::arg("program_index"), py::arg("inputs"), - py::arg("outputs"), "Submit a binary for execution"); + m.def("to_host", &tt::runtime::toHost, py::arg("tensor"), + py::arg("utilize") = false, "Copy the tensor to the host"); + m.def( + "to_device", + [](const ::tt::runtime::Tensor tensor, + ::tt::runtime::Device device) -> ::tt::runtime::Tensor { + return ::tt::runtime::toDevice(tensor, device); + }, + py::arg("tensor"), py::arg("device"), "Copy the tensor to the device"); + m.def( + "to_device", + [](::tt::runtime::Tensor tensor, ::tt::runtime::Device device, + ::tt::runtime::Layout layout) -> ::tt::runtime::Tensor { + return ::tt::runtime::toDevice(tensor, device, layout); + }, + py::arg("tensor"), py::arg("device"), py::arg("layout"), + "Copy the tensor to the device with the specified layout"); + m.def("to_layout", &tt::runtime::toLayout, py::arg("tensor"), + py::arg("layout"), + "Create a copy of the tensor with the specified layout"); + m.def("get_layout", &tt::runtime::getLayout, py::arg("executable"), + py::arg("program_index"), py::arg("input_index"), + "Get the layout of the input tensor"); + m.def( + "submit", + [](::tt::runtime::Device device, ::tt::runtime::Binary executable, + std::uint32_t programIndex, + const std::vector<::tt::runtime::Tensor> &inputs) + -> std::vector<::tt::runtime::Tensor> { + return ::tt::runtime::submit(device, executable, programIndex, inputs); + }, + py::arg("device"), py::arg("executable"), py::arg("program_index"), + py::arg("inputs"), + "Submit a ttnn binary for execution, returns a vector of output tensors"); + m.def( + "submit", + [](::tt::runtime::Device device, ::tt::runtime::Binary executable, + std::uint32_t programIndex, + const std::vector<::tt::runtime::Tensor> &inputs, + const std::vector<::tt::runtime::Tensor> &outputs) + -> ::tt::runtime::Event { + return ::tt::runtime::submit(device, executable, programIndex, inputs, + outputs); + }, + py::arg("device"), py::arg("executable"), py::arg("program_index"), + py::arg("inputs"), py::arg("outputs"), + "Submit a ttmetal binary for execution. 
returns event wrapper"); m.def("wait", &tt::runtime::wait, py::arg("event")); m.def( "get_op_output_tensor", @@ -100,7 +145,22 @@ PYBIND11_MODULE(_C, m) { "Get the input tensor of the op"); m.def("get_op_debug_str", &tt::runtime::getOpDebugString, "Get the debug string of the op"); - + m.def( + "memcpy", + [](std::uintptr_t dst, ::tt::runtime::Tensor src) { + ::tt::runtime::memcpy(reinterpret_cast(dst), src); + }, + py::arg("dst"), py::arg("src"), + "Copy the data from src tensor to dst memory location"); + m.def( + "memcpy", + [](::tt::runtime::Tensor dst, ::tt::runtime::Tensor src) { + ::tt::runtime::memcpy(dst, src); + }, + py::arg("dst"), py::arg("src"), + "Copy the data from src tensor to dst tensor"); + m.def("deallocate_tensor", &tt::runtime::deallocateTensor, py::arg("tensor"), + py::arg("force") = false, "Deallocate the tensor memory"); py::class_(m, "DebugEnv") .def_static("get", &tt::runtime::debug::Env::get) .def("__str__", [](const tt::runtime::debug::Env &env) { diff --git a/test/ttmlir/Dialect/TTNN/eltwise/unary/isfinite/simple_isfinite.mlir b/test/ttmlir/Dialect/TTNN/eltwise/unary/isfinite/simple_isfinite.mlir index e819e68f4b..3089da6692 100644 --- a/test/ttmlir/Dialect/TTNN/eltwise/unary/isfinite/simple_isfinite.mlir +++ b/test/ttmlir/Dialect/TTNN/eltwise/unary/isfinite/simple_isfinite.mlir @@ -1,15 +1,15 @@ // RUN: ttmlir-opt --ttir-to-ttnn-backend-pipeline %s | FileCheck %s #any_device = #tt.operand_constraint module attributes {} { - func.func @is_finite(%arg0: tensor<64x128xf32>) -> tensor<64x128xbf16> { + func.func @is_finite(%arg0: tensor<64x128xbf16>) -> tensor<64x128xbf16> { // CHECK: %[[C:.*]] = "ttnn.empty" // CHECK-SAME: [[TENSOR:tensor<64x128xbf16,]] %0 = tensor.empty() : tensor<64x128xbf16> // CHECK: %[[C:.*]] = "ttnn.isfinite" - // CHECK-SAME: tensor<64x128xf32, + // CHECK-SAME: tensor<64x128xbf16, // CHECK-SAME: [[TENSOR]] // CHECK-SAME: -> [[TENSOR]] - %1 = "ttir.isfinite"(%arg0, %0) <{operandSegmentSizes = array, operand_constraints = [#any_device, #any_device]}> : (tensor<64x128xf32>, tensor<64x128xbf16>) -> tensor<64x128xbf16> + %1 = "ttir.isfinite"(%arg0, %0) <{operandSegmentSizes = array, operand_constraints = [#any_device, #any_device]}> : (tensor<64x128xbf16>, tensor<64x128xbf16>) -> tensor<64x128xbf16> return %1 : tensor<64x128xbf16> } } diff --git a/test/ttmlir/Silicon/StableHLO/Unary/isfinite_op.mlir b/test/ttmlir/Silicon/StableHLO/Unary/isfinite_op.mlir index 04b9f1fefb..35682c8c0b 100644 --- a/test/ttmlir/Silicon/StableHLO/Unary/isfinite_op.mlir +++ b/test/ttmlir/Silicon/StableHLO/Unary/isfinite_op.mlir @@ -7,14 +7,14 @@ // RUN: FileCheck --input-file=%t.mlir %s module @jit_eltwise_isfinite attributes {} { - func.func public @test_isfinite(%arg0: tensor<64x128xf32>) -> tensor<64x128xi1> { + func.func public @test_isfinite(%arg0: tensor<64x128xbf16>) -> tensor<64x128xi1> { // CHECK-LABEL: func.func public @test_isfinite // CHECK: ttnn.empty // CHECK: ttnn.isfinite - // CHECK-SAME: tensor<64x128xf32, + // CHECK-SAME: tensor<64x128xbf16, // CHECK-SAME: tensor<64x128xbf16, // CHECK-SAME: -> tensor<64x128xbf16, - %0 = stablehlo.is_finite %arg0 : (tensor<64x128xf32>) -> tensor<64x128xi1> + %0 = stablehlo.is_finite %arg0 : (tensor<64x128xbf16>) -> tensor<64x128xi1> return %0 : tensor<64x128xi1> } } diff --git a/test/ttmlir/Silicon/StableHLO/select_op.mlir b/test/ttmlir/Silicon/StableHLO/select_op.mlir index 23b7182ce0..1cdc5e9d05 100644 --- a/test/ttmlir/Silicon/StableHLO/select_op.mlir +++ b/test/ttmlir/Silicon/StableHLO/select_op.mlir @@ -6,23 +6,23 
diff --git a/test/ttmlir/Silicon/TTNN/perf_unit/test_perf_isfinite.mlir b/test/ttmlir/Silicon/TTNN/perf_unit/test_perf_isfinite.mlir
index ce0146be40..f1489a5ebd 100644
--- a/test/ttmlir/Silicon/TTNN/perf_unit/test_perf_isfinite.mlir
+++ b/test/ttmlir/Silicon/TTNN/perf_unit/test_perf_isfinite.mlir
@@ -4,14 +4,14 @@
 #any_device = #tt.operand_constraint<any_device>
 #any_device_tile = #tt.operand_constraint<any_device_tile>
 
-func.func @is_finite(%arg0: tensor<64x128xf32>) -> tensor<64x128xbf16> {
+func.func @is_finite(%arg0: tensor<64x128xbf16>) -> tensor<64x128xbf16> {
   // CHECK: %[[C:.*]] = "ttnn.empty"
   // CHECK-SAME: [[TENSOR:tensor<64x128xbf16,]]
   %0 = tensor.empty() : tensor<64x128xbf16>
   // CHECK: %[[C:.*]] = "ttnn.isfinite"
-  // CHECK-SAME: tensor<64x128xf32,
+  // CHECK-SAME: tensor<64x128xbf16,
   // CHECK-SAME: [[TENSOR]]
   // CHECK-SAME: -> [[TENSOR]]
-  %1 = "ttir.isfinite"(%arg0, %0) <{operandSegmentSizes = array<i32: 1, 1>, operand_constraints = [#any_device, #any_device]}> : (tensor<64x128xf32>, tensor<64x128xbf16>) -> tensor<64x128xbf16>
+  %1 = "ttir.isfinite"(%arg0, %0) <{operandSegmentSizes = array<i32: 1, 1>, operand_constraints = [#any_device, #any_device]}> : (tensor<64x128xbf16>, tensor<64x128xbf16>) -> tensor<64x128xbf16>
   return %1 : tensor<64x128xbf16>
 }
diff --git a/test/ttmlir/Silicon/TTNN/perf_unit/test_perf_le.mlir b/test/ttmlir/Silicon/TTNN/perf_unit/test_perf_le.mlir
deleted file mode 100644
index 79de8c062d..0000000000
--- a/test/ttmlir/Silicon/TTNN/perf_unit/test_perf_le.mlir
+++ /dev/null
@@ -1,21 +0,0 @@
-// RUN: ttmlir-opt --ttir-to-ttnn-backend-pipeline="system-desc-path=%system_desc_path%" %s > %t.mlir
-// RUN: FileCheck %s --input-file=%t.mlir
-// RUN: ttmlir-translate --ttnn-to-flatbuffer %t.mlir > %t.ttnn
-
-#any_device = #tt.operand_constraint<any_device>
-#any_device_tile = #tt.operand_constraint<any_device_tile>
-
-module attributes {} {
-  func.func @less_equal(%arg0: tensor<13x31xf32>, %arg1: tensor<13x31xf32>) -> tensor<13x31xf32> {
-    // CHECK: %[[C:.*]] = "ttnn.empty"
-    // CHECK-SAME: [[TENSOR:tensor<13x31xf32,]]
-    %0 = tensor.empty() : tensor<13x31xf32>
-    // CHECK: %[[C:.*]] = "ttnn.le"
-    // CHECK-SAME: [[TENSOR]]
-    // CHECK-SAME: [[TENSOR]]
-    // CHECK-SAME: [[TENSOR]]
-    // CHECK-SAME: -> [[TENSOR]]
-    %1 = "ttir.le"(%arg0, %arg1, %0) <{operandSegmentSizes = array<i32: 2, 1>, operand_constraints = [#any_device, #any_device, #any_device]}> : (tensor<13x31xf32>, tensor<13x31xf32>, tensor<13x31xf32>) -> tensor<13x31xf32>
-    return %1 : tensor<13x31xf32>
-  }
-}
diff --git a/test/ttmlir/Silicon/TTNN/perf_unit/test_perf_where.mlir b/test/ttmlir/Silicon/TTNN/perf_unit/test_perf_where.mlir
index 3bed0528c6..647f94e61e 100644
--- a/test/ttmlir/Silicon/TTNN/perf_unit/test_perf_where.mlir
+++ b/test/ttmlir/Silicon/TTNN/perf_unit/test_perf_where.mlir
@@ -4,13 +4,13 @@
 #any_device = #tt.operand_constraint<any_device>
 #any_device_tile = #tt.operand_constraint<any_device_tile>
 
-func.func @test_where(%arg0: tensor<13x37xf32>, %arg1: tensor<13x37xf32>) -> tensor<13x37xf32> {
+func.func @test_where(%arg0: tensor<13x37xbf16>, %arg1: tensor<13x37xbf16>) -> tensor<13x37xbf16> {
   %0 = tensor.empty() : tensor<13x37xbf16>
-  %1 = "ttir.eq"(%arg0, %arg1, %0) <{operandSegmentSizes = array<i32: 2, 1>, operand_constraints = [#any_device_tile, #any_device_tile, #any_device_tile]}> : (tensor<13x37xf32>, tensor<13x37xf32>, tensor<13x37xbf16>) -> tensor<13x37xbf16>
-  %2 = tensor.empty() : tensor<13x37xf32>
-  %3 = "ttir.where"(%1, %arg0, %arg1, %2) <{operandSegmentSizes = array<i32: 3, 1>, operand_constraints = [#any_device_tile, #any_device_tile, #any_device_tile, #any_device_tile]}> : (tensor<13x37xbf16>, tensor<13x37xf32>, tensor<13x37xf32>, tensor<13x37xf32>) -> tensor<13x37xf32>
+  %1 = "ttir.eq"(%arg0, %arg1, %0) <{operandSegmentSizes = array<i32: 2, 1>, operand_constraints = [#any_device_tile, #any_device_tile, #any_device_tile]}> : (tensor<13x37xbf16>, tensor<13x37xbf16>, tensor<13x37xbf16>) -> tensor<13x37xbf16>
+  %2 = tensor.empty() : tensor<13x37xbf16>
+  %3 = "ttir.where"(%1, %arg0, %arg1, %2) <{operandSegmentSizes = array<i32: 3, 1>, operand_constraints = [#any_device_tile, #any_device_tile, #any_device_tile, #any_device_tile]}> : (tensor<13x37xbf16>, tensor<13x37xbf16>, tensor<13x37xbf16>, tensor<13x37xbf16>) -> tensor<13x37xbf16>
   // CHECK: %[[EMPTY:.*]] = "ttnn.empty"{{.*}}
   // CHECK: %[[VAL1:[0-9]+]] = "ttnn.eq"(%{{[0-9]+}}, %{{[0-9]+}}, %[[EMPTY]])
   // CHECK: %{{[0-9]+}} = "ttnn.where"(%[[VAL1]], %{{[0-9]+}}, %{{[0-9]+}}, %{{[0-9]+}})
-  return %3 : tensor<13x37xf32>
+  return %3 : tensor<13x37xbf16>
 }
diff --git a/test/ttmlir/Silicon/TTNN/simple_broadcast.mlir b/test/ttmlir/Silicon/TTNN/simple_broadcast.mlir
index 1d88725d1d..db02d1ba13 100644
--- a/test/ttmlir/Silicon/TTNN/simple_broadcast.mlir
+++ b/test/ttmlir/Silicon/TTNN/simple_broadcast.mlir
@@ -1,6 +1,7 @@
 // RUN: ttmlir-opt --ttir-to-ttnn-backend-pipeline="system-desc-path=%system_desc_path%" %s > %t.mlir
 // RUN: FileCheck %s --input-file=%t.mlir
 // RUN: ttmlir-translate --ttnn-to-flatbuffer %t.mlir > %t.ttnn
+// UNSUPPORTED: true
 
 #any_device = #tt.operand_constraint<any_device>
 func.func public @broadcast() -> (tensor<32xf32>) {
diff --git a/test/ttmlir/Silicon/TTNN/simple_eltwise.mlir b/test/ttmlir/Silicon/TTNN/simple_eltwise.mlir
index 976f2867db..0bb337e7f9 100644
--- a/test/ttmlir/Silicon/TTNN/simple_eltwise.mlir
+++ b/test/ttmlir/Silicon/TTNN/simple_eltwise.mlir
@@ -65,15 +65,15 @@ func.func @floor(%arg0: tensor<64x128xf32>) -> tensor<64x128xf32> {
   return %1 : tensor<64x128xf32>
 }
 
-func.func @is_finite(%arg0: tensor<64x128xf32>) -> tensor<64x128xbf16> {
+func.func @is_finite(%arg0: tensor<64x128xbf16>) -> tensor<64x128xbf16> {
   // CHECK: %[[C:.*]] = "ttnn.empty"
   // CHECK-SAME: [[TENSOR:tensor<64x128xbf16,]]
   %0 = tensor.empty() : tensor<64x128xbf16>
   // CHECK: %[[C:.*]] = "ttnn.isfinite"
-  // CHECK-SAME: tensor<64x128xf32,
+  // CHECK-SAME: tensor<64x128xbf16,
   // CHECK-SAME: [[TENSOR]]
   // CHECK-SAME: -> [[TENSOR]]
-  %1 = "ttir.isfinite"(%arg0, %0) <{operandSegmentSizes = array<i32: 1, 1>, operand_constraints = [#any_device, #any_device]}> : (tensor<64x128xf32>, tensor<64x128xbf16>) -> tensor<64x128xbf16>
+  %1 = "ttir.isfinite"(%arg0, %0) <{operandSegmentSizes = array<i32: 1, 1>, operand_constraints = [#any_device, #any_device]}> : (tensor<64x128xbf16>, tensor<64x128xbf16>) -> tensor<64x128xbf16>
   return %1 : tensor<64x128xbf16>
 }
 
@@ -278,15 +278,15 @@ func.func @get_dimension_size(%arg0: tensor<13x21x3xf32>) -> tensor<1xi32> {
   // CHECK: return [[VAL]] : tensor<1xi32, {{.*}}>
 }
 
-func.func @test_where(%arg0: tensor<13x37xf32>, %arg1: tensor<13x37xf32>) -> tensor<13x37xf32> {
+func.func @test_where(%arg0: tensor<13x37xbf16>, %arg1: tensor<13x37xbf16>) -> tensor<13x37xbf16> {
   %0 = tensor.empty() : tensor<13x37xbf16>
-  %1 = "ttir.eq"(%arg0, %arg1, %0) <{operandSegmentSizes = array<i32: 2, 1>, operand_constraints = [#any_device_tile, #any_device_tile, #any_device_tile]}> : (tensor<13x37xf32>, tensor<13x37xf32>, tensor<13x37xbf16>) -> tensor<13x37xbf16>
-  %2 = tensor.empty() : tensor<13x37xf32>
-  %3 = "ttir.where"(%1, %arg0, %arg1, %2) <{operandSegmentSizes = array<i32: 3, 1>, operand_constraints = [#any_device_tile, #any_device_tile, #any_device_tile, #any_device_tile]}> : (tensor<13x37xbf16>, tensor<13x37xf32>, tensor<13x37xf32>, tensor<13x37xf32>) -> tensor<13x37xf32>
+  %1 = "ttir.eq"(%arg0, %arg1, %0) <{operandSegmentSizes = array<i32: 2, 1>, operand_constraints = [#any_device_tile, #any_device_tile, #any_device_tile]}> : (tensor<13x37xbf16>, tensor<13x37xbf16>, tensor<13x37xbf16>) -> tensor<13x37xbf16>
+  %2 = tensor.empty() : tensor<13x37xbf16>
+  %3 = "ttir.where"(%1, %arg0, %arg1, %2) <{operandSegmentSizes = array<i32: 3, 1>, operand_constraints = [#any_device_tile, #any_device_tile, #any_device_tile, #any_device_tile]}> : (tensor<13x37xbf16>, tensor<13x37xbf16>, tensor<13x37xbf16>, tensor<13x37xbf16>) -> tensor<13x37xbf16>
   // CHECK: %[[EMPTY:.*]] = "ttnn.empty"{{.*}}
   // CHECK: %[[VAL1:[0-9]+]] = "ttnn.eq"(%{{[0-9]+}}, %{{[0-9]+}}, %[[EMPTY]])
   // CHECK: %{{[0-9]+}} = "ttnn.where"(%[[VAL1]], %{{[0-9]+}}, %{{[0-9]+}}, %{{[0-9]+}})
-  return %3 : tensor<13x37xf32>
+  return %3 : tensor<13x37xbf16>
 }
 
 func.func @gelu(%arg0: tensor<64x128xf32>) -> tensor<64x128xf32> {