Commit fce58b3
Runtime support for multi-chip tensors/ops, including creation and
execution. Updated the program context to use a parent/sub-mesh model,
where sub-meshes do not close devices while the parent mesh is still alive.
jnie-TT committed Nov 6, 2024
1 parent ae93524 commit fce58b3
Showing 29 changed files with 610 additions and 201 deletions.
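
The parent/sub-mesh model mentioned in the commit message boils down to shared ownership: a sub-mesh keeps a reference to its parent, and the underlying devices are closed only when the root mesh itself is destroyed. The following is an illustrative C++ sketch of that lifetime rule; the type and function names are hypothetical, not the actual tt-mlir/tt-metal mesh API.

#include <memory>
#include <utility>
#include <vector>

struct MeshDevice {
  std::vector<int> deviceIds;
  std::shared_ptr<MeshDevice> parent; // empty for a root (parent) mesh

  ~MeshDevice() {
    // Only a root mesh owns the hardware handles; a sub-mesh just drops
    // its reference to the parent and never closes devices itself.
    if (!parent) {
      closeDevices();
    }
  }

  void closeDevices() { /* release hardware handles */ }
};

// Creating a sub-mesh shares the parent's devices instead of reopening
// them; the shared_ptr keeps the parent (and its devices) alive.
std::shared_ptr<MeshDevice> createSubMesh(std::shared_ptr<MeshDevice> parent,
                                          std::vector<int> ids) {
  auto sub = std::make_shared<MeshDevice>();
  sub->deviceIds = std::move(ids);
  sub->parent = std::move(parent);
  return sub;
}
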
17 changes: 17 additions & 0 deletions include/ttmlir/Target/Common/Target.h
@@ -0,0 +1,17 @@
// SPDX-FileCopyrightText: (c) 2024 Tenstorrent AI ULC
//
// SPDX-License-Identifier: Apache-2.0

#ifndef TTMLIR_TARGET_COMMON_TARGET_H
#define TTMLIR_TARGET_COMMON_TARGET_H

#pragma clang diagnostic push
#pragma clang diagnostic ignored "-Wcovered-switch-default"

#include "ttmlir/Target/Common/system_desc_generated.h"
#include "ttmlir/Target/Common/types_generated.h"
#include "ttmlir/Target/Common/version_generated.h"

#pragma clang diagnostic pop

#endif
28 changes: 28 additions & 0 deletions include/ttmlir/Target/Common/types.fbs
@@ -85,6 +85,33 @@ table MemoryConfigDesc {
shard_spec: ShardSpec;
}

table ReplicateTensor {
replication_factor: uint32;
}

table ShardTensor {
shard_dim: uint32;
}

table ShardTensor2D {
shard_mesh: Dim2d;
}

table AllGatherTensor {

}

union DistributedTensorConfig {
ReplicateTensor,
ShardTensor,
ShardTensor2D,
AllGatherTensor
}

table DistributionStrategy {
strategy: DistributedTensorConfig;
}

table MemoryDesc {
shape: [int];
tile_shape: Dim2d;
@@ -99,6 +126,7 @@ table LayoutDesc {
oob_val: OOBVal;
core_range_set: [Dim2dRange];
memory_desc: MemoryDesc;
strategy: DistributionStrategy;
}

table TensorDesc {
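
The union added above is built in two steps with the flatc-generated C++ API: first the concrete variant table, then the DistributionStrategy wrapper that records which variant was chosen. A minimal sketch, assuming standard flatc codegen for the helper names (verify CreateReplicateTensor against types_generated.h before relying on it):

#include "flatbuffers/flatbuffers.h"
#include "ttmlir/Target/Common/types_generated.h"

flatbuffers::Offset<::tt::target::DistributionStrategy>
buildReplicateStrategy(flatbuffers::FlatBufferBuilder &fbb) {
  // Concrete variant first (generated helper, assumed name).
  auto replicate =
      ::tt::target::CreateReplicateTensor(fbb, /*replication_factor=*/4);
  // The union field is stored as a (type tag, untyped offset) pair.
  return ::tt::target::CreateDistributionStrategy(
      fbb, ::tt::target::DistributedTensorConfig::ReplicateTensor,
      replicate.Union());
}
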
4 changes: 4 additions & 0 deletions include/ttmlir/Target/TTNN/program.fbs
@@ -46,14 +46,18 @@ table EmptyOp {
shape: [int64];
dtype: DataType;
layout: TensorLayout;
num_shards: uint32;
device: tt.target.DeviceRef; // optional
memcfg: tt.target.MemoryConfigDesc; // optional
strategy: tt.target.DistributionStrategy;
out: tt.target.TensorRef;
}

table FullOp {
device: tt.target.DeviceRef;
fill_value: float;
num_shards: uint32;
strategy: tt.target.DistributionStrategy;
out: tt.target.TensorRef;
}

9 changes: 8 additions & 1 deletion include/ttmlir/Target/Utils/MLIRToFlatbuffer.h
@@ -424,11 +424,18 @@ layoutAttrToFlatbuffer(FlatbufferObjectCache &cache, Attribute attr,
std::vector<int32_t> stride(strideInt64.begin(), strideInt64.end());
auto coreRangeSet =
toFlatbuffer(cache, layoutAttr.getGrid(), deviceAttr.getWorkerGrid());
::tt::target::DistributedTensorConfig distributionType =
::tt::target::DistributedTensorConfig::NONE;
::flatbuffers::Offset<void> distribution = 0;
flatbuffers::Offset<::tt::target::DistributionStrategy> strategy =
::tt::target::CreateDistributionStrategy(*cache.fbb, distributionType,
distribution);
return ::tt::target::CreateLayoutDescDirect(
*cache.fbb, &stride, toFlatbuffer(cache, layoutAttr.getOobVal()),
&coreRangeSet,
cache.getOrCreate(layoutAttr.getMemref(), memrefAttrToFlatbuffer,
layoutAttr.getMemLayout()));
layoutAttr.getMemLayout()),
strategy);
}

inline flatbuffers::Offset<::tt::target::TensorDesc>
2 changes: 1 addition & 1 deletion lib/Dialect/TT/IR/TTOpsTypes.cpp
@@ -12,7 +12,7 @@
#include "mlir/IR/BuiltinOps.h"
#include "mlir/IR/DialectImplementation.h"
#include "ttmlir/Dialect/TT/IR/TT.h"
#include "ttmlir/Target/Common/system_desc_generated.h"
#include "ttmlir/Target/Common/Target.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/ADT/TypeSwitch.h"
20 changes: 18 additions & 2 deletions lib/Target/TTNN/TTNNToFlatbuffer.cpp
@@ -208,14 +208,21 @@ createOp(FlatbufferObjectCache &cache, EmptyOp op) {
::tt::target::TensorLayout layout =
::tt::mlir::ttnn::utils::toTargetTensorLayout(op.getLayout().value());

uint32_t numShards = 1;
::tt::target::DistributedTensorConfig distributionType =
::tt::target::DistributedTensorConfig::NONE;
::flatbuffers::Offset<void> distribution = 0;
flatbuffers::Offset<::tt::target::DistributionStrategy> strategy =
::tt::target::CreateDistributionStrategy(*cache.fbb, distributionType,
distribution);
auto output = getOperandThroughDPSOps(op.getResult());

// If the device is not set, we create on host
//
if (!op.getDevice()) {
return ::tt::target::ttnn::CreateEmptyOp(
*cache.fbb, cache.fbb->CreateVector<int64_t>(shape), dtype, layout,
/* device */ 0, /* memcfg */ 0,
numShards, /* device */ 0, /* memcfg */ 0, strategy,
cache.getOrCreate(output, tensorValueToFlatbuffer,
kHostAllocatedAddress, kHostAllocatedSize));
}
@@ -227,7 +234,8 @@ createOp(FlatbufferObjectCache &cache, EmptyOp op) {

return ::tt::target::ttnn::CreateEmptyOp(
*cache.fbb, cache.fbb->CreateVector<int64_t>(shape), dtype, layout,
cache.at<::tt::target::DeviceRef>(device), memoryConfigDesc,
numShards, cache.at<::tt::target::DeviceRef>(device), memoryConfigDesc,
strategy,
cache.getOrCreate(output, tensorValueToFlatbuffer, kHostAllocatedAddress,
kHostAllocatedSize));
}
@@ -237,8 +245,16 @@ createOp(FlatbufferObjectCache &cache, FullOp op) {
auto device = getOperandThroughDPSOps(op.getDevice());
auto fillValue = op.getFillValue().convertToFloat();
auto output = getOperandThroughDPSOps(op.getResult());
uint32_t numShards = 1;
::tt::target::DistributedTensorConfig distributionType =
::tt::target::DistributedTensorConfig::NONE;
::flatbuffers::Offset<void> distribution = 0;
flatbuffers::Offset<::tt::target::DistributionStrategy> strategy =
::tt::target::CreateDistributionStrategy(*cache.fbb, distributionType,
distribution);
return ::tt::target::ttnn::CreateFullOp(
*cache.fbb, cache.at<::tt::target::DeviceRef>(device), fillValue,
numShards, strategy,
cache.getOrCreate(output, tensorValueToFlatbuffer, kHostAllocatedAddress,
kHostAllocatedSize));
}
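
Both createOp overloads above build the identical placeholder NONE strategy inline. A possible follow-up cleanup, not part of this commit, would hoist that into one helper:

static flatbuffers::Offset<::tt::target::DistributionStrategy>
createDefaultDistributionStrategy(FlatbufferObjectCache &cache) {
  // Single-device default: no distribution variant attached.
  return ::tt::target::CreateDistributionStrategy(
      *cache.fbb, ::tt::target::DistributedTensorConfig::NONE,
      /*distribution=*/0);
}
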
2 changes: 1 addition & 1 deletion python/TTModule.cpp
@@ -11,7 +11,7 @@
#include "mlir/CAPI/IR.h"

#include "ttmlir/Dialect/TT/IR/TTOpsTypes.h"
#include "ttmlir/Target/Common/types_generated.h"
#include "ttmlir/Target/Common/Target.h"
#include "ttmlir/Utils.h"

namespace mlir::ttmlir::python {
1 change: 1 addition & 0 deletions runtime/include/tt/runtime/detail/ttmetal.h
@@ -45,6 +45,7 @@
#pragma clang diagnostic pop

#include "tt/runtime/types.h"
#include "tt/runtime/utils.h"
#include "ttmlir/Target/TTMetal/Target.h"

namespace tt::runtime::ttmetal {
15 changes: 15 additions & 0 deletions runtime/include/tt/runtime/detail/ttnn.h
@@ -62,6 +62,7 @@
#include "ttnn/operations/normalization/softmax/softmax.hpp"
#include "ttnn/operations/pool/maxpool/max_pool2d.hpp"
#include "ttnn/operations/reduction/generic/generic_reductions.hpp"
#include "ttnn/tensor/host_buffer/functions.hpp"
#include "ttnn/tensor/tensor.hpp"
#include "ttnn/tensor/types.hpp"
#pragma clang diagnostic pop
@@ -81,11 +82,25 @@ Tensor createTensor(std::shared_ptr<void> data,
std::vector<std::uint32_t> const &stride,
std::uint32_t itemsize, ::tt::target::DataType dataType);

Tensor
createTensor(std::vector<std::shared_ptr<void>> &data,
std::vector<std::uint32_t> const &shape,
std::vector<std::uint32_t> const &stride, std::uint32_t itemsize,
::tt::target::DataType dataType,
std::unordered_map<std::string, std::string> const &strategy);

inline Tensor createTensor(std::shared_ptr<void> data, TensorDesc const &desc) {
return createTensor(data, desc.shape, desc.stride, desc.itemsize,
desc.dataType);
}

inline Tensor
createTensor(std::vector<std::shared_ptr<void>> &data, TensorDesc const &desc,
std::unordered_map<std::string, std::string> const &strategy) {
return createTensor(data, desc.shape, desc.stride, desc.itemsize,
desc.dataType, strategy);
}

tt::target::DataType getTensorDataType(Tensor tensor);

size_t getNumAvailableDevices();
14 changes: 14 additions & 0 deletions runtime/include/tt/runtime/runtime.h
@@ -36,11 +36,25 @@ Tensor createTensor(std::shared_ptr<void> data,
std::vector<std::uint32_t> const &stride,
std::uint32_t itemsize, ::tt::target::DataType dataType);

Tensor
createTensor(std::vector<std::shared_ptr<void>> &data,
std::vector<std::uint32_t> const &shape,
std::vector<std::uint32_t> const &stride, std::uint32_t itemsize,
::tt::target::DataType dataType,
std::unordered_map<std::string, std::string> const &strategy);

inline Tensor createTensor(std::shared_ptr<void> data, TensorDesc const &desc) {
return createTensor(data, desc.shape, desc.stride, desc.itemsize,
desc.dataType);
}

inline Tensor
createTensor(std::vector<std::shared_ptr<void>> &data, TensorDesc const &desc,
std::unordered_map<std::string, std::string> const &strategy) {
return createTensor(data, desc.shape, desc.stride, desc.itemsize,
desc.dataType, strategy);
}

tt::target::DataType getTensorDataType(Tensor tensor);

size_t getNumAvailableDevices();
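
A usage sketch for the new multi-shard overload follows. The "strategy" and "shard_dim" map keys are hypothetical placeholders; the accepted keys are defined by the TTNN runtime implementation, which this diff does not show.

#include "tt/runtime/runtime.h"

#include <cstdint>
#include <memory>
#include <string>
#include <unordered_map>
#include <vector>

int main() {
  std::vector<std::uint32_t> shape = {32, 32};
  std::vector<std::uint32_t> stride = {32, 1};
  constexpr std::uint32_t itemsize = sizeof(float);

  // One host buffer per device shard.
  std::vector<std::shared_ptr<void>> shards;
  for (int i = 0; i < 2; ++i) {
    shards.emplace_back(new float[32 * 32](),
                        [](void *p) { delete[] static_cast<float *>(p); });
  }

  // Hypothetical keys; check the TTNN runtime for the real contract.
  std::unordered_map<std::string, std::string> strategy = {
      {"strategy", "shard"}, {"shard_dim", "0"}};

  tt::runtime::Tensor tensor = tt::runtime::createTensor(
      shards, shape, stride, itemsize, ::tt::target::DataType::Float32,
      strategy);
  (void)tensor;
  return 0;
}
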
4 changes: 3 additions & 1 deletion runtime/include/tt/runtime/types.h
@@ -10,9 +10,11 @@
#include <string_view>
#include <vector>

#include "tt/runtime/utils.h"
#pragma clang diagnostic push
#pragma clang diagnostic ignored "-Wcovered-switch-default"
#include "ttmlir/Target/Common/system_desc_generated.h"
#include "ttmlir/Target/Common/types_generated.h"
#pragma clang diagnostic pop

namespace tt::runtime {

3 changes: 3 additions & 0 deletions runtime/include/tt/runtime/utils.h
@@ -7,7 +7,10 @@

#include <memory>

#pragma clang diagnostic push
#pragma clang diagnostic ignored "-Wcovered-switch-default"
#include "ttmlir/Target/Common/types_generated.h"
#pragma clang diagnostic pop

namespace tt::runtime::utils {

24 changes: 24 additions & 0 deletions runtime/lib/runtime.cpp
@@ -124,6 +124,30 @@ Tensor createTensor(std::shared_ptr<void> data,
throw std::runtime_error("runtime is not enabled");
}

Tensor
createTensor(std::vector<std::shared_ptr<void>> &data,
std::vector<std::uint32_t> const &shape,
std::vector<std::uint32_t> const &stride, std::uint32_t itemsize,
::tt::target::DataType dataType,
std::unordered_map<std::string, std::string> const &strategy) {
LOG_ASSERT(not shape.empty());
LOG_ASSERT(not stride.empty());
LOG_ASSERT(itemsize > 0);
#if defined(TT_RUNTIME_ENABLE_TTNN)
if (getCurrentRuntime() == DeviceRuntime::TTNN) {
return ::tt::runtime::ttnn::createTensor(data, shape, stride, itemsize,
dataType, strategy);
}
#endif

#if defined(TT_RUNTIME_ENABLE_TTMETAL)
if (getCurrentRuntime() == DeviceRuntime::TTMetal) {
throw std::runtime_error("Not implemented");
}
#endif
throw std::runtime_error("runtime is not enabled");
}

tt::target::DataType getTensorDataType(Tensor tensor) {
#if defined(TT_RUNTIME_ENABLE_TTNN)
if (getCurrentRuntime() == DeviceRuntime::TTNN) {
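
On the read side, the runtime recovers the chosen variant by switching on the union's type tag. A sketch assuming standard flatc-generated accessors (strategy_type(), strategy_as_ReplicateTensor(), and so on; confirm against types_generated.h):

#include "ttmlir/Target/Common/types_generated.h"

bool isMultiDevice(const ::tt::target::DistributionStrategy *strategy) {
  switch (strategy->strategy_type()) {
  case ::tt::target::DistributedTensorConfig::ReplicateTensor:
    return strategy->strategy_as_ReplicateTensor()->replication_factor() > 1;
  case ::tt::target::DistributedTensorConfig::ShardTensor:
  case ::tt::target::DistributedTensorConfig::ShardTensor2D:
  case ::tt::target::DistributedTensorConfig::AllGatherTensor:
    return true;
  case ::tt::target::DistributedTensorConfig::NONE:
    return false;
  }
  return false; // unreachable: the switch covers every variant
}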