Commit

Refactor toMemoryConfigOp (#253)
* #92: Refactor toMemoryConfigOp

* Hard-code the tilize/untilize code path until tile_shape is properly implemented
jnie-TT authored Aug 2, 2024
1 parent e791b09 commit dcb4b51
Showing 6 changed files with 191 additions and 57 deletions.
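
At a high level, the rewritten ToMemoryConfigOp handler (full diff in runtime/lib/ttnn/program.cpp below) dispatches on the output's target memory space and the input tensor's storage type. The sketch below condenses that decision logic into plain, self-contained C++; the enums and the describeToMemoryConfig function are illustrative stand-ins drawn from the diff, not types or APIs from the repository.

#include <stdexcept>
#include <string>

// Illustrative stand-ins for ::tt::target::MemorySpace and
// ::tt::tt_metal::StorageType; not the real types.
enum class MemorySpace { System, SystemMMIO, DeviceDRAM, DeviceL1 };
enum class StorageType { Borrowed, Device };
enum class DataType { Float32, BFloat16 };

// Summarizes the code path chosen by the refactored handler.
std::string describeToMemoryConfig(MemorySpace target, StorageType storage,
                                   DataType dtype) {
  switch (target) {
  case MemorySpace::System:
  case MemorySpace::SystemMMIO:
    // Host output: untilize (pulling the tensor to host first if it lives on
    // device), convert the data type if needed, then memcpy into the output.
    return storage == StorageType::Device
               ? "cpu() -> untilize -> convert dtype -> memcpy"
               : "untilize -> convert dtype -> memcpy";
  case MemorySpace::DeviceDRAM:
  case MemorySpace::DeviceL1:
    if (storage == StorageType::Borrowed) {
      // Host input heading to device: device tilize requires BFLOAT16, so
      // non-BFLOAT16 tensors are tilized on host before to_device.
      return dtype == DataType::BFloat16
                 ? "to_device -> tilize on device -> convert dtype"
                 : "tilize on host -> to_device -> convert dtype";
    }
    // Already on device: adjust dtype/layout, then to_memory_config
    // (DRAM_MEMORY_CONFIG or L1_MEMORY_CONFIG).
    return "convert dtype -> to_memory_config";
  }
  throw std::runtime_error("unknown memory space");
}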
1 change: 1 addition & 0 deletions runtime/include/tt/runtime/detail/ttnn.h
@@ -35,6 +35,7 @@
#define FMT_HEADER_ONLY
#include "ttnn/device.hpp"
#include "ttnn/operations/binary.hpp"
#include "ttnn/operations/copy.hpp"
#include "ttnn/operations/core.hpp"
#include "ttnn/operations/creation.hpp"
#include "ttnn/operations/matmul.hpp"
6 changes: 3 additions & 3 deletions runtime/lib/binary.cpp
@@ -76,7 +76,7 @@ std::vector<TensorDesc> getProgramInputs(Flatbuffer binary,
input->desc()->shape()->end()};
desc.stride = {input->desc()->layout()->stride()->begin(),
input->desc()->layout()->stride()->end()};
- desc.itemsize = utils::dataTypeElementSize(
desc.itemsize = ::tt::runtime::utils::dataTypeElementSize(
input->desc()->layout()->memory_desc()->data_type());
desc.dataType = input->desc()->layout()->memory_desc()->data_type();
inputs.push_back(desc);
@@ -94,7 +94,7 @@ std::vector<TensorDesc> getProgramOutputs(Flatbuffer binary,
output->desc()->shape()->end()};
desc.stride = {output->desc()->layout()->stride()->begin(),
output->desc()->layout()->stride()->end()};
- desc.itemsize = utils::dataTypeElementSize(
desc.itemsize = ::tt::runtime::utils::dataTypeElementSize(
output->desc()->layout()->memory_desc()->data_type());
desc.dataType = output->desc()->layout()->memory_desc()->data_type();
outputs.push_back(desc);
@@ -210,7 +210,7 @@ Flatbuffer Flatbuffer::loadFromPath(char const *path) {

std::streampos size = fbb.tellg();
fbb.seekg(0, std::ios::beg);
- auto buffer = utils::malloc_shared(size);
auto buffer = ::tt::runtime::utils::malloc_shared(size);
fbb.read(static_cast<char *>(buffer.get()), size);
return Flatbuffer(buffer);
}
1 change: 1 addition & 0 deletions runtime/lib/ttnn/CMakeLists.txt
@@ -3,6 +3,7 @@ add_library(TTRuntimeTTNN
runtime.cpp
program.cpp
)
target_compile_options(TTRuntimeTTNN PRIVATE -mavx -mavx2)
target_include_directories(TTRuntimeTTNN PUBLIC
${PROJECT_SOURCE_DIR}/runtime/include
${PROJECT_BINARY_DIR}/include/ttmlir/Target/Common
175 changes: 142 additions & 33 deletions runtime/lib/ttnn/program.cpp
@@ -8,6 +8,7 @@

#include "tt/runtime/detail/ttnn.h"
#include "tt/runtime/runtime.h"
#include "utils.h"

#include "ttmlir/Target/TTNN/Target.h"
#include "ttmlir/Version.h"
@@ -24,50 +25,158 @@
// some reason a static_assert fails when this is called from within our
// namespace.
ttnn::Tensor tilize(ttnn::Tensor const &input) {
- ttnn::Tensor unsqueezeTensor = ttnn::unsqueeze_to_4D(input);
- return ttnn::to_layout(unsqueezeTensor, ttnn::TILE_LAYOUT, std::nullopt,
return ttnn::to_layout(input, ::ttnn::TILE_LAYOUT, std::nullopt, std::nullopt,
(Device *)nullptr);
}

ttnn::Tensor untilize(ttnn::Tensor const &input) {
return ttnn::to_layout(input, ::ttnn::ROW_MAJOR_LAYOUT, std::nullopt,
std::nullopt, (Device *)nullptr);
}

namespace tt::runtime::ttnn {

static ::ttnn::Tensor convertDataType(const ::ttnn::Tensor &input,
const ::ttnn::DataType &targetDataType) {
const ::ttnn::StorageType storageType = input.storage_type();
if (storageType == ::tt::tt_metal::StorageType::BORROWED) {
return ::ttnn::to_dtype(input, targetDataType);
} else if (storageType == ::tt::tt_metal::StorageType::DEVICE) {
if (input.get_layout() != ::ttnn::TILE_LAYOUT) {
// typecast op requires tilized tensor
::ttnn::Tensor converted =
::ttnn::typecast(::tilize(input), targetDataType);
// untilize and return
return ::untilize(converted);
}
return ::ttnn::typecast(input, targetDataType);
} else {
throw std::runtime_error("Unsupported storage type");
}
}

/* TODO: Blocked by issue #272, ideal flow is to determine tilize/untilize with
* tile_shape */
static ::ttnn::Tensor
updateLayoutAndDataType(const ::ttnn::Tensor &inputTensor,
const ::ttnn::DataType targetDataType,
const bool shouldTilize, const bool shouldUntilize) {
::ttnn::Tensor outputTensor = inputTensor;
const bool shouldConvertDataType = inputTensor.get_dtype() != targetDataType;
// const int targetTileX = targetTileShape->x();
// const int targetTileY = targetTileShape->y();
// const bool shouldTilize =
// targetTileX == 32 and targetTileY == 32 and
// inputTensor.get_layout() == ::ttnn::ROW_MAJOR_LAYOUT;
// const bool shouldUntilize = (targetTileX != 32 or targetTileY != 32) and
// inputTensor.get_layout() ==
// ::ttnn::TILE_LAYOUT;
if (shouldTilize) {
outputTensor = ::tilize(outputTensor);
} else if (shouldUntilize) {
outputTensor = ::untilize(outputTensor);
}
if (shouldConvertDataType) {
outputTensor = convertDataType(outputTensor, targetDataType);
}
return outputTensor;
}

// TODO: right now hardcoding tilize/untilize, should determine with tile shape
// blocked by issue #272
static void
run(::tt::target::ttnn::ToMemoryConfigOp const *op, ::ttnn::Device &device,
std::unordered_map<std::uint32_t, ::ttnn::Tensor *> &liveTensors,
std::list<::ttnn::Tensor> &tensorPool) {
- if (op->out()->desc()->layout()->memory_desc()->memory_space() ==
- ::tt::target::MemorySpace::System) {
- auto &inputTensor = *liveTensors.at(op->in0()->global_id());
- auto cpu = inputTensor.cpu();
- ::ttnn::Tensor untilized;
- if (op->out()->desc()->layout()->memory_desc()->data_type() ==
- ::tt::target::DataType::Float32) {
- untilized = ::tt::tt_metal::tensor_impl::to_layout<float>(
- cpu, ::ttnn::ROW_MAJOR_LAYOUT);
- } else if (op->out()->desc()->layout()->memory_desc()->data_type() ==
- ::tt::target::DataType::BFloat16) {
- untilized = ::tt::tt_metal::tensor_impl::to_layout<bfloat16>(
- cpu, ::ttnn::ROW_MAJOR_LAYOUT);
- } else {
- throw std::runtime_error("Unsupported data type");
const ::ttnn::Tensor &inputTensor = *liveTensors.at(op->in0()->global_id());
assert(inputTensor.storage_type() == ::tt::tt_metal::StorageType::BORROWED or
inputTensor.storage_type() == ::tt::tt_metal::StorageType::DEVICE);

const ::tt::target::Dim2d *targetTileShape =
op->out()->desc()->layout()->memory_desc()->tile_shape();
TT_FATAL(utils::isValidTileShape(targetTileShape),
"Invalid tile shape ({}, {})", targetTileShape->x(),
targetTileShape->y());

::tt::target::DataType targetDataType =
op->out()->desc()->layout()->memory_desc()->data_type();
::ttnn::DataType targetDataTypeTTNN = utils::toTTNNDataType(targetDataType);

const ::tt::target::MemorySpace targetMemorySpace =
op->out()->desc()->layout()->memory_desc()->memory_space();

switch (targetMemorySpace) {
case ::tt::target::MemorySpace::System:
case ::tt::target::MemorySpace::SystemMMIO: {
::ttnn::Tensor result;
if (inputTensor.storage_type() == ::tt::tt_metal::StorageType::BORROWED) {
result =
updateLayoutAndDataType(inputTensor, targetDataTypeTTNN, false, true);
} else if (inputTensor.storage_type() ==
::tt::tt_metal::StorageType::DEVICE) {
result = updateLayoutAndDataType(inputTensor.cpu(), targetDataTypeTTNN,
false, true);
}
- auto &outputTensor = *liveTensors.at(op->out()->global_id());
- void *src = ::tt::tt_metal::get_raw_host_data_ptr(untilized);
::ttnn::Tensor &outputTensor = *liveTensors.at(op->out()->global_id());
void *src = ::tt::tt_metal::get_raw_host_data_ptr(result);
void *dst = ::tt::tt_metal::get_raw_host_data_ptr(outputTensor);
- std::uint32_t size = untilized.volume() * untilized.element_size();
std::uint32_t size = result.volume() * result.element_size();
std::memcpy(dst, src, size);
- return;
- }
- bool isL1 = op->in0()->desc()->layout()->memory_desc()->memory_space() ==
- ::tt::target::MemorySpace::DeviceL1;
- const auto memoryConfig =
- isL1 ? ::ttnn::L1_MEMORY_CONFIG : ::ttnn::DRAM_MEMORY_CONFIG;
- auto &inputTensor = *liveTensors.at(op->in0()->global_id());
- ::ttnn::Tensor tilized = ::tilize(inputTensor);
- auto deviceTensor = ::ttnn::to_device(tilized, &device, memoryConfig);
- tensorPool.push_back(deviceTensor);
- // auto [iter, inserted] =
- liveTensors.try_emplace(op->out()->global_id(), &tensorPool.back());
- // assert(inserted && "Duplicate output tensor");
break;
}
case ::tt::target::MemorySpace::DeviceDRAM: {
::tt::tt_metal::MemoryConfig memConfig = ::ttnn::DRAM_MEMORY_CONFIG;
if (inputTensor.storage_type() == ::tt::tt_metal::StorageType::BORROWED) {
::ttnn::Tensor result = inputTensor;
bool shouldTilize = true;
// device tilize requires BFLOAT16, if not then tilize on host
if (result.get_dtype() != ::ttnn::DataType::BFLOAT16) {
result = ::tilize(result);
shouldTilize = false;
}
result = ::ttnn::to_device(result, &device, memConfig);
result = updateLayoutAndDataType(result, targetDataTypeTTNN, shouldTilize,
false);
tensorPool.push_back(result);
liveTensors.try_emplace(op->out()->global_id(), &tensorPool.back());
} else if (inputTensor.storage_type() ==
::tt::tt_metal::StorageType::DEVICE) {
::ttnn::Tensor result = updateLayoutAndDataType(
inputTensor, targetDataTypeTTNN, false, false);
result = ::ttnn::to_memory_config(result, memConfig, std::nullopt);
tensorPool.push_back(result);
liveTensors.try_emplace(op->out()->global_id(), &tensorPool.back());
}
break;
}
// Currently similar to ::tt::target::MemorySpace::DeviceDRAM
// But will need its own code path when we add support for sharding
case ::tt::target::MemorySpace::DeviceL1: {
::tt::tt_metal::MemoryConfig memConfig = ::ttnn::L1_MEMORY_CONFIG;
if (inputTensor.storage_type() == ::tt::tt_metal::StorageType::BORROWED) {
::ttnn::Tensor result = inputTensor;
bool shouldTilize = true;
// device tilize requires BFLOAT16, if not then tilize on host
if (result.get_dtype() != ::ttnn::DataType::BFLOAT16) {
result = ::tilize(result);
shouldTilize = false;
}
result = ::ttnn::to_device(result, &device, memConfig);
result = updateLayoutAndDataType(result, targetDataTypeTTNN, shouldTilize,
false);
tensorPool.push_back(result);
liveTensors.try_emplace(op->out()->global_id(), &tensorPool.back());
} else if (inputTensor.storage_type() ==
::tt::tt_metal::StorageType::DEVICE) {
::ttnn::Tensor result = updateLayoutAndDataType(
inputTensor, targetDataTypeTTNN, false, false);
result = ::ttnn::to_memory_config(result, memConfig, std::nullopt);
tensorPool.push_back(result);
liveTensors.try_emplace(op->out()->global_id(), &tensorPool.back());
}
break;
}
}
}

static void
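
The run() handlers above share a simple ownership convention: tensorPool owns the tensors produced while executing the program (presumably a std::list so that pointers into it stay stable as results are appended), and liveTensors maps flatbuffer global_ids to whichever tensor currently holds each value. Below is a minimal standalone sketch of that bookkeeping, using a placeholder Tensor struct rather than the real ::ttnn::Tensor.

#include <cstdint>
#include <list>
#include <unordered_map>

// Placeholder standing in for ::ttnn::Tensor in this sketch.
struct Tensor { int payload; };

int main() {
  // tensorPool owns tensors created while running the program; std::list
  // pointers remain valid as new results are pushed back.
  std::list<Tensor> tensorPool;
  // liveTensors maps flatbuffer global_ids to whichever tensor currently
  // holds that value (program inputs/outputs or pool entries).
  std::unordered_map<std::uint32_t, Tensor *> liveTensors;

  Tensor input{42};
  liveTensors.try_emplace(/*global_id=*/0, &input);

  // A ToMemoryConfigOp-style handler produces a new tensor, parks it in the
  // pool, and registers the op's output id against the pooled copy.
  tensorPool.push_back(Tensor{input.payload});
  liveTensors.try_emplace(/*global_id=*/1, &tensorPool.back());

  return liveTensors.at(1)->payload == 42 ? 0 : 1;
}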
24 changes: 3 additions & 21 deletions runtime/lib/ttnn/runtime.cpp
@@ -5,6 +5,7 @@
#include "tt/runtime/runtime.h"
#include "tt/runtime/detail/ttnn.h"
#include "tt/runtime/utils.h"
#include "utils.h"

#include "ttmlir/Target/TTNN/Target.h"
#include "ttmlir/Version.h"
@@ -71,7 +72,7 @@ std::pair<SystemDesc, DeviceIds> getCurrentSystemDesc() {
}
uint8_t *buf = fbb.GetBufferPointer();
auto size = fbb.GetSize();
- auto handle = utils::malloc_shared(size);
auto handle = ::tt::runtime::utils::malloc_shared(size);
std::memcpy(handle.get(), buf, size);
::ttnn::close_device(device);
return std::make_pair(SystemDesc(handle), chipIds);
@@ -104,33 +105,14 @@ static BorrowedStorage createStorage(void *ptr, std::uint32_t numElements,
}
}

- static ::ttnn::DataType toTTNNDataType(::tt::target::DataType dataType) {
- switch (dataType) {
- case ::tt::target::DataType::Float32:
- return ::ttnn::DataType::FLOAT32;
- // case ::tt::target::DataType::Float16:
- // return ::ttnn::DataType::FLOAT16;
- case ::tt::target::DataType::BFloat16:
- return ::ttnn::DataType::BFLOAT16;
- case ::tt::target::DataType::UInt32:
- return ::ttnn::DataType::UINT32;
- case ::tt::target::DataType::UInt16:
- return ::ttnn::DataType::UINT16;
- // case ::tt::target::DataType::UInt8:
- // return ::ttnn::DataType::UINT8;
- default:
- throw std::runtime_error("Unsupported data type");
- }
- }

Tensor createTensor(std::shared_ptr<void> data,
std::vector<std::uint32_t> const &shape,
std::vector<std::uint32_t> const &stride,
std::uint32_t itemsize, ::tt::target::DataType dataType) {
std::uint32_t numElements = shape[0] * stride[0];
auto tensor = std::make_shared<::ttnn::Tensor>(
createStorage(data.get(), numElements, dataType), shape,
toTTNNDataType(dataType), ::ttnn::Layout::ROW_MAJOR);
utils::toTTNNDataType(dataType), ::ttnn::Layout::ROW_MAJOR);
return Tensor(tensor, data);
}

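
For orientation, a hedged usage sketch of the createTensor entry point whose body is shown above. Assumptions (not part of this commit): createTensor and Tensor are exposed through tt/runtime/runtime.h in namespace ::tt::runtime, and ::tt::target::DataType is visible via the Target.h header used elsewhere in this diff; the shape and strides are illustrative.

#include <cstdint>
#include <vector>

#include "tt/runtime/runtime.h"        // assumed to declare ::tt::runtime::createTensor
#include "tt/runtime/utils.h"          // ::tt::runtime::utils::malloc_shared
#include "ttmlir/Target/TTNN/Target.h" // ::tt::target::DataType

int main() {
  // A 2x4 row-major float32 host tensor.
  std::vector<std::uint32_t> shape = {2, 4};
  std::vector<std::uint32_t> stride = {4, 1};
  std::uint32_t itemsize = sizeof(float);

  // createTensor sizes its borrowed storage as shape[0] * stride[0] elements.
  std::uint32_t numElements = shape[0] * stride[0];
  auto data = ::tt::runtime::utils::malloc_shared(numElements * itemsize);

  ::tt::runtime::Tensor tensor = ::tt::runtime::createTensor(
      data, shape, stride, itemsize, ::tt::target::DataType::Float32);
  (void)tensor;
  return 0;
}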
41 changes: 41 additions & 0 deletions runtime/lib/ttnn/utils.h
@@ -0,0 +1,41 @@
// SPDX-FileCopyrightText: (c) 2024 Tenstorrent AI ULC
//
// SPDX-License-Identifier: Apache-2.0

#ifndef TTNN_RUNTIME_UTILS_H
#define TTNN_RUNTIME_UTILS_H

#include "ttmlir/Target/TTNN/Target.h"
#include "ttnn/types.hpp"

namespace tt::runtime::ttnn::utils {

inline bool isValidTileShape(const ::tt::target::Dim2d *shape) {
return (shape->x() == 0 and shape->y() == 0) or
(shape->x() == 1 and shape->y() == 1) or
(shape->x() == 32 and shape->y() == 32);
}

inline ::ttnn::DataType toTTNNDataType(::tt::target::DataType dataType) {
switch (dataType) {
case ::tt::target::DataType::Float32:
return ::ttnn::DataType::FLOAT32;
case ::tt::target::DataType::BFloat16:
return ::ttnn::DataType::BFLOAT16;
case ::tt::target::DataType::BFP_BFloat8:
return ::ttnn::DataType::BFLOAT8_B;
case ::tt::target::DataType::BFP_BFloat4:
return ::ttnn::DataType::BFLOAT4_B;
case ::tt::target::DataType::UInt32:
return ::ttnn::DataType::UINT32;
case ::tt::target::DataType::UInt16:
return ::ttnn::DataType::UINT16;

default:
throw std::runtime_error("Unsupported data type");
}
}

} // namespace tt::runtime::ttnn::utils

#endif
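
To close, a hedged sketch exercising the new toTTNNDataType helper (isValidTileShape is analogous, taking a ::tt::target::Dim2d from the flatbuffer layout). Assumptions: the include of "utils.h" resolves to the new runtime/lib/ttnn/utils.h, and the generated ::tt::target::DataType enum still defines Float16, as the commented-out case removed from runtime.cpp suggests.

#include <cassert>
#include <iostream>
#include <stdexcept>

#include "utils.h"

int main() {
  using ::tt::runtime::ttnn::utils::toTTNNDataType;

  // Supported conversions map 1:1 onto ::ttnn::DataType values.
  assert(toTTNNDataType(::tt::target::DataType::Float32) ==
         ::ttnn::DataType::FLOAT32);
  assert(toTTNNDataType(::tt::target::DataType::BFloat16) ==
         ::ttnn::DataType::BFLOAT16);

  // Unsupported types surface as std::runtime_error.
  try {
    toTTNNDataType(::tt::target::DataType::Float16);
  } catch (const std::runtime_error &e) {
    std::cerr << "unsupported: " << e.what() << "\n";
  }
  return 0;
}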
