Support dram memory space in metal direct (#584)
This change adds support for the device DRAM memory space in the metal
backend.  This required a few bits of refactoring, including:

- `DeviceAttr` splits the worker grid out from the L1 map; they now
  respectively describe the physical compute cores and the physical L1 memory
  layout.
- `DeviceAttr` gets a new map, `dramMap`, which maps a linear tensor
  coordinate to a physical DRAM address.
- Change `projectOnto` to take an arbitrary affine map instead of a grid
  attr.  This lets us pass a unique affine map for L1, DRAM, or (in the
  future) Ethernet through the same interface (see the sketch after this
  list).
- `PhysicalCoreCoordMapping` can now take an L1 grid or a DRAM grid.
- Give explicit enum index names to the `DeviceAttr` affine map results.
- The NoC datamovement program can now be generated as either reads or
  writes, which is necessary for writing to DRAM since DRAM cores do not have
  a RISC core.
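
A minimal sketch of the unified interface (not part of the diff; the free
function names and surrounding context are illustrative, while the
`DeviceAttr` and `PhysicalCoreCoordMapping` calls are the ones added here):

```cpp
#include "mlir/IR/AffineMap.h"
#include "ttmlir/Dialect/TT/IR/TTOpsTypes.h"
#include "ttmlir/Dialect/TT/Utils/PhysicalCoreCoord.h"

// Both the address map and the physical core mapping are chosen from the
// memory space, so L1 and DRAM lowering share one code path.
static mlir::AffineMap selectMemoryMap(mlir::tt::DeviceAttr device,
                                       mlir::tt::MemorySpace memSpace) {
  return device.getMapForMemorySpace(memSpace); // l1Map or dramMap
}

static mlir::tt::PhysicalCoreCoordMapping
selectCoreMapping(llvm::ArrayRef<unsigned> chipIds,
                  llvm::ArrayRef<mlir::tt::ChipDescAttr> chipDescs,
                  mlir::tt::MemorySpace memSpace) {
  // Resolves to the worker (L1) grid or the DRAM grid.
  return mlir::tt::PhysicalCoreCoordMapping::getMemorySpaceMapping(
      chipIds, chipDescs, memSpace);
}
```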

Closes #359
nsmithtt authored Sep 4, 2024
1 parent abd5bfb commit 9c61739
Showing 12 changed files with 518 additions and 186 deletions.
19 changes: 19 additions & 0 deletions include/ttmlir/Dialect/TT/IR/TTOpsTypes.h
@@ -12,6 +12,25 @@
#include "ttmlir/Dialect/TT/IR/TTOpsEnums.h.inc"

namespace mlir::tt {
struct PhysGridResultIdx {
enum : int64_t {
DeviceIdx = 0,
CoreCoordY = 1,
CoreCoordX = 2,
NumIndices = 3,
};
};

struct MemoryMapResultIdx {
enum : int64_t {
DeviceIdx = 0,
CoreCoordY = 1,
CoreCoordX = 2,
ShardOffset = 3,
NumIndices = 4,
};
};

inline bool isSystemMemorySpace(MemorySpace memorySpace) {
return memorySpace == MemorySpace::System ||
memorySpace == MemorySpace::SystemMMIO;
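For orientation, a minimal sketch (not part of the diff; the helper name
`physicalCore` and the single-device assumption are illustrative) of using the
new index names in place of bare `0`/`1`/`2` literals when decomposing a
composed grid-map result:

```cpp
#include <array>
#include <cassert>
#include <cstdint>

#include "mlir/IR/AffineMap.h"
#include "ttmlir/Dialect/TT/IR/TTOpsTypes.h"

// Decompose a physical grid map result using the PhysGridResultIdx names.
static std::array<std::int64_t, 2>
physicalCore(mlir::AffineMap gridMap,
             llvm::ArrayRef<std::int64_t> virtualCoreCoord) {
  auto coreCoord = gridMap.compose(virtualCoreCoord);
  assert(coreCoord.size() == mlir::tt::PhysGridResultIdx::NumIndices);
  assert(coreCoord[mlir::tt::PhysGridResultIdx::DeviceIdx] == 0 &&
         "single device assumed in this sketch");
  return {coreCoord[mlir::tt::PhysGridResultIdx::CoreCoordY],
          coreCoord[mlir::tt::PhysGridResultIdx::CoreCoordX]};
}
```
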
29 changes: 22 additions & 7 deletions include/ttmlir/Dialect/TT/IR/TTOpsTypes.td
@@ -290,7 +290,7 @@ def TT_LayoutAttr : TT_Attr<"Layout", "layout"> {
llvm::SmallVector<int64_t> getStride(ArrayRef<int64_t> logicalShape) const;
llvm::SmallVector<int64_t> getPhysicalShape(ArrayRef<int64_t> logicalShape) const;
llvm::SmallVector<int64_t> getShardShape() const;
AffineMap projectOnto(AffineMap linearMap, ArrayRef<int64_t> logicalTensorShape, GridAttr grid) const;
AffineMap projectOnto(AffineMap linearMap, AffineMap physicalMemoryMap, ArrayRef<int64_t> logicalTensorShape) const;
AffineMap getIdentityTileLinearMap() const;
llvm::SmallVector<int64_t> getTiledShape(ArrayRef<int64_t> logicalTensorShape) const;
}];
@@ -321,18 +321,33 @@ def TT_BufferAttr : TT_Attr<"Buffer", "buffer", []> {
}

def TT_DeviceAttr : TT_Attr<"Device", "device", []> {
let summary = "Device attribute in TT dialect";
let summary = "Device attribute in TT dialect.";
let description = [{
Describes the physical layout of a device in the system and is made up of a few components:
- A grid attribute that describes the device's compute grid shape. It not only describes the shape of the compute grid, but also
carries an affine map that describes how the logical grid maps to the physical grid.
- Two affine maps that describe how a tensor layout's linear attribute maps to the L1 and DRAM memory spaces.
- An array of chip ids that this device is made up of.
}];
let parameters = (ins TT_GridAttr:$grid, ArrayRefParameter<"unsigned">:$chipIds);
let assemblyFormat = "`<` qualified($grid) `,` `[` $chipIds `]` `>`";
let parameters = (ins TT_GridAttr:$workerGrid,
"AffineMap":$l1Map,
"AffineMap":$dramMap,
ArrayRefParameter<"unsigned">:$chipIds);
let assemblyFormat = "`<` `workerGrid` `=` qualified($workerGrid) `,` `l1Map` `=` qualified($l1Map) `,` `dramMap` `=` qualified($dramMap) `,` `chipIds` `=` `[` $chipIds `]` `>`";

let extraClassDeclaration = [{
static DeviceAttr get(::mlir::MLIRContext *context, ArrayRef<int64_t> shape, AffineMap physicalGridMapping, ArrayRef<unsigned> chipIds) {
return DeviceAttr::get(context, GridAttr::get(context, shape, physicalGridMapping), chipIds);
}
static DeviceAttr get(::mlir::MLIRContext *context, SystemDescAttr systemDesc, ArrayRef<unsigned> chipIds);
static DeviceAttr get(::mlir::MLIRContext *context, SystemDescAttr systemDesc, bool enableMultichip = false);
AffineMap getMapForMemorySpace(MemorySpace memorySpace) const {
switch (memorySpace) {
case MemorySpace::DeviceL1:
return getL1Map();
case MemorySpace::DeviceDRAM:
return getDramMap();
default:
llvm_unreachable("Unsupported memory space");
}
}
}];

let genVerifyDecl = 1;
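For reference, a minimal sketch (not part of the diff; `inspectDevice` and the
single chip id are illustrative) of building the new `DeviceAttr` and reading
the components described in its new description:

```cpp
#include "mlir/IR/AffineMap.h"
#include "mlir/IR/MLIRContext.h"
#include "ttmlir/Dialect/TT/IR/TTOpsTypes.h"

// Build a DeviceAttr from a system descriptor and query the generated
// accessors for its workerGrid, l1Map, and dramMap parameters.
void inspectDevice(mlir::MLIRContext *ctx, mlir::tt::SystemDescAttr systemDesc) {
  unsigned chipIds[] = {0};
  auto device = mlir::tt::DeviceAttr::get(ctx, systemDesc, chipIds);

  mlir::tt::GridAttr workerGrid = device.getWorkerGrid(); // physical compute grid
  mlir::AffineMap l1Map = device.getL1Map();     // linear tensor coord -> physical L1 address
  mlir::AffineMap dramMap = device.getDramMap(); // linear tensor coord -> physical DRAM address
  (void)workerGrid;
  (void)l1Map;
  (void)dramMap;
}
```
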
79 changes: 67 additions & 12 deletions include/ttmlir/Dialect/TT/Utils/PhysicalCoreCoord.h
@@ -21,12 +21,20 @@ struct PhysicalCoreCoord {

std::int64_t &operator[](std::size_t i) {
assert(i < 3);
return i == 0 ? d : i == 1 ? y : x;
switch (i) {
case 0:
return d;
case 1:
return y;
case 2:
return x;
default:
llvm_unreachable("invalid index");
}
}

std::int64_t operator[](std::size_t i) const {
assert(i < 3);
return i == 0 ? d : i == 1 ? y : x;
return (*const_cast<PhysicalCoreCoord *>(this))[i];
}

bool operator==(PhysicalCoreCoord const &other) const {
@@ -36,32 +36,44 @@ struct PhysicalCoreCoord {

class PhysicalCoreCoordMapping {
public:
PhysicalCoreCoordMapping(ArrayRef<tt::ChipDescAttr> chipDescs) {
ArrayRef<int64_t> firstChipGrid = chipDescs.front().getGrid();
static PhysicalCoreCoordMapping
getWorkerMapping(ArrayRef<unsigned> chipIds,
ArrayRef<tt::ChipDescAttr> chipDescs) {
SmallVector<std::array<int64_t, 2>> physCores;
ArrayRef<int64_t> firstChipGrid = chipDescs[chipIds.front()].getGrid();
assert(firstChipGrid.size() == 2);
grid = {firstChipGrid[0], firstChipGrid[1]};
std::array<int64_t, 2> grid = {firstChipGrid[0], firstChipGrid[1]};

workers.reserve(chipDescs.size() * grid[0] * grid[1]);
for (auto chipDesc : chipDescs) {
physCores.reserve(chipIds.size() * grid[0] * grid[1]);
for (auto chipId : chipIds) {
auto chipDesc = chipDescs[chipId];
auto chipGrid = chipDesc.getGrid();
assert(chipGrid == firstChipGrid);
ChipPhysicalCoresAttr chipPhysicalCores = chipDesc.getChipPhysicalCores();
assert(chipPhysicalCores.getWorker().size() ==
static_cast<size_t>(grid[0] * grid[1]));
for (auto worker : chipPhysicalCores.getWorker()) {
workers.push_back({worker.getY(), worker.getX()});
physCores.push_back({worker.getY(), worker.getX()});
}
}
assert(workers.size() == chipDescs.size() * grid[0] * grid[1]);
assert(physCores.size() == chipIds.size() * grid[0] * grid[1]);
return PhysicalCoreCoordMapping(grid, physCores);
}

static PhysicalCoreCoordMapping
getDramMapping(ArrayRef<unsigned> chipIds,
ArrayRef<tt::ChipDescAttr> chipDescs) {
ArrayRef<CoreCoordAttr> firstChipDramCores =
chipDescs[chipIds.front()].getChipPhysicalCores().getDram();

std::array<int64_t, 2> grid = {
1, static_cast<int64_t>(firstChipDramCores.size())};
SmallVector<std::array<int64_t, 2>> physCores;
physCores.reserve(chipIds.size() * grid[0] * grid[1]);
for (auto chipId : chipIds) {
auto chipDesc = chipDescs[chipId];
ChipPhysicalCoresAttr chipPhysicalCores = chipDesc.getChipPhysicalCores();
assert(chipPhysicalCores.getDram().size() ==
static_cast<size_t>(grid[0] * grid[1]));
for (auto dram : chipPhysicalCores.getDram()) {
physCores.push_back({dram.getY(), dram.getX()});
}
}
assert(physCores.size() == chipIds.size() * grid[0] * grid[1]);
return PhysicalCoreCoordMapping(grid, physCores);
}

static PhysicalCoreCoordMapping
getMemorySpaceMapping(ArrayRef<unsigned> chipIds,
ArrayRef<tt::ChipDescAttr> chipDescs,
MemorySpace memorySpace) {
switch (memorySpace) {
case MemorySpace::DeviceL1:
return getWorkerMapping(chipIds, chipDescs);
case MemorySpace::DeviceDRAM:
return getDramMapping(chipIds, chipDescs);
default:
llvm_unreachable("unsupported memory space");
}
}

std::array<int64_t, 2> operator[](PhysicalCoreCoord coord) const {
return workers[coord.d * grid[0] * grid[1] + coord.y * grid[1] + coord.x];
return physCores[coord.d * grid[0] * grid[1] + coord.y * grid[1] + coord.x];
}

private:
PhysicalCoreCoordMapping(std::array<int64_t, 2> grid,
SmallVector<std::array<int64_t, 2>> physCores)
: grid(grid), physCores(physCores) {}

private:
std::array<int64_t, 2> grid;
SmallVector<std::array<int64_t, 2>> workers;
SmallVector<std::array<int64_t, 2>> physCores;
};
} // namespace mlir::tt

16 changes: 10 additions & 6 deletions include/ttmlir/Target/Utils/MLIRToFlatbuffer.h
@@ -289,19 +289,23 @@ toFlatbuffer(FlatbufferObjectCache &cache, GridAttr tensorGrid,
::ttmlir::utils::sample(
tensorGridShape, [&](ArrayRef<std::int64_t> virtualCoreCoord) {
SmallVector<std::int64_t> coreCoord = mapping.compose(virtualCoreCoord);
assert(coreCoord.size() == 3 && "expected a 2D core");
assert(coreCoord[0] == 0 && "expected single device");
assert(coreCoord.size() == PhysGridResultIdx::NumIndices &&
"expected a 2D core");
assert(coreCoord[PhysGridResultIdx::DeviceIdx] == 0 &&
"expected single device");
if (!coreRangeSet.empty() &&
((coreRangeSet.back().loc().y() == coreCoord[1]) &&
((coreRangeSet.back().loc().y() ==
coreCoord[PhysGridResultIdx::CoreCoordY]) &&
(coreRangeSet.back().loc().x() + coreRangeSet.back().size().x()) ==
coreCoord[2])) {
coreCoord[PhysGridResultIdx::CoreCoordX])) {
coreRangeSet.back() = ::tt::target::Dim2dRange(
coreRangeSet.back().loc(),
::tt::target::Dim2d(coreRangeSet.back().size().y(),
coreRangeSet.back().size().x() + 1));
} else {
coreRangeSet.push_back(::tt::target::Dim2dRange(
::tt::target::Dim2d(coreCoord[1], coreCoord[2]),
::tt::target::Dim2d(coreCoord[PhysGridResultIdx::CoreCoordY],
coreCoord[PhysGridResultIdx::CoreCoordX]),
::tt::target::Dim2d(1, 1)));
}
if (coreRangeSet.size() > 1 &&
@@ -401,7 +405,7 @@ layoutAttrToFlatbuffer(FlatbufferObjectCache &cache, Attribute attr,
auto strideInt64 = layoutAttr.getStride(logicalShape);
std::vector<int32_t> stride(strideInt64.begin(), strideInt64.end());
auto coreRangeSet =
toFlatbuffer(cache, layoutAttr.getGrid(), deviceAttr.getGrid());
toFlatbuffer(cache, layoutAttr.getGrid(), deviceAttr.getWorkerGrid());
return ::tt::target::CreateLayoutDescDirect(
*cache.fbb, &stride, toFlatbuffer(cache, layoutAttr.getOobVal()),
&coreRangeSet,
4 changes: 4 additions & 0 deletions include/ttmlir/Utils.h
@@ -15,6 +15,10 @@ template <typename T> T alignUp(T ptr, T alignment) {
return (ptr + alignment - 1) & ~(alignment - 1);
}

template <typename T> T alignDown(T ptr, T alignment) {
return ptr & ~(alignment - 1);
}

template <typename Vector, typename Fn>
inline void sample(Vector const &shape, Fn fn) {
llvm::SmallVector<std::int64_t, 8> strides(shape.size());