Support dram memory space in metal direct (#584)
This change adds support for the device DRAM memory space in the metal
backend.  This required a few bits of refactoring, including:

- `DeviceAttr` splits the worker grid out from the L1 map; they now
  respectively describe the physical compute cores and the physical L1 memory
  layout.
- `DeviceAttr` gets a new map, `dramMap`, which maps a linear tensor
  coordinate to a physical DRAM address.
- Change `projectOnto` to take an arbitrary affine map instead of a grid
  attr.  This lets us pass a unique affine map for L1, DRAM, or (in the
  future) Ethernet through the same interface (see the sketch after this
  list).
- `PhysicalCoreCoordMapping` can now take an L1 grid or a DRAM grid.
- Give explicit enum index names to the `DeviceAttr` affine map results.
- The NoC datamovement program can now be generated as either reads or
  writes, which is necessary for writing to DRAM since DRAM cores do not have
  a RISC core.
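
A minimal sketch of the unified interface (not part of the diff; the free
function names and surrounding context are illustrative, while the
`DeviceAttr` and `PhysicalCoreCoordMapping` calls are the ones added here):

```cpp
#include "mlir/IR/AffineMap.h"
#include "ttmlir/Dialect/TT/IR/TTOpsTypes.h"
#include "ttmlir/Dialect/TT/Utils/PhysicalCoreCoord.h"

// Both the address map and the physical core mapping are chosen from the
// memory space, so L1 and DRAM lowering share one code path.
static mlir::AffineMap selectMemoryMap(mlir::tt::DeviceAttr device,
                                       mlir::tt::MemorySpace memSpace) {
  return device.getMapForMemorySpace(memSpace); // l1Map or dramMap
}

static mlir::tt::PhysicalCoreCoordMapping
selectCoreMapping(llvm::ArrayRef<unsigned> chipIds,
                  llvm::ArrayRef<mlir::tt::ChipDescAttr> chipDescs,
                  mlir::tt::MemorySpace memSpace) {
  // Resolves to the worker (L1) grid or the DRAM grid.
  return mlir::tt::PhysicalCoreCoordMapping::getMemorySpaceMapping(
      chipIds, chipDescs, memSpace);
}
```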

Closes #359
nsmithtt authored Sep 4, 2024
1 parent abd5bfb commit 9c61739
Showing 12 changed files with 518 additions and 186 deletions.
19 changes: 19 additions & 0 deletions include/ttmlir/Dialect/TT/IR/TTOpsTypes.h
@@ -12,6 +12,25 @@
#include "ttmlir/Dialect/TT/IR/TTOpsEnums.h.inc"

namespace mlir::tt {
struct PhysGridResultIdx {
enum : int64_t {
DeviceIdx = 0,
CoreCoordY = 1,
CoreCoordX = 2,
NumIndices = 3,
};
};

struct MemoryMapResultIdx {
enum : int64_t {
DeviceIdx = 0,
CoreCoordY = 1,
CoreCoordX = 2,
ShardOffset = 3,
NumIndices = 4,
};
};

inline bool isSystemMemorySpace(MemorySpace memorySpace) {
return memorySpace == MemorySpace::System ||
memorySpace == MemorySpace::SystemMMIO;
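For orientation, a minimal sketch (not part of the diff; the helper name
`physicalCore` and the single-device assumption are illustrative) of using the
new index names in place of bare `0`/`1`/`2` literals when decomposing a
composed grid-map result:

```cpp
#include <array>
#include <cassert>
#include <cstdint>

#include "mlir/IR/AffineMap.h"
#include "ttmlir/Dialect/TT/IR/TTOpsTypes.h"

// Decompose a physical grid map result using the PhysGridResultIdx names.
static std::array<std::int64_t, 2>
physicalCore(mlir::AffineMap gridMap,
             llvm::ArrayRef<std::int64_t> virtualCoreCoord) {
  auto coreCoord = gridMap.compose(virtualCoreCoord);
  assert(coreCoord.size() == mlir::tt::PhysGridResultIdx::NumIndices);
  assert(coreCoord[mlir::tt::PhysGridResultIdx::DeviceIdx] == 0 &&
         "single device assumed in this sketch");
  return {coreCoord[mlir::tt::PhysGridResultIdx::CoreCoordY],
          coreCoord[mlir::tt::PhysGridResultIdx::CoreCoordX]};
}
```
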
29 changes: 22 additions & 7 deletions include/ttmlir/Dialect/TT/IR/TTOpsTypes.td
@@ -290,7 +290,7 @@ def TT_LayoutAttr : TT_Attr<"Layout", "layout"> {
llvm::SmallVector<int64_t> getStride(ArrayRef<int64_t> logicalShape) const;
llvm::SmallVector<int64_t> getPhysicalShape(ArrayRef<int64_t> logicalShape) const;
llvm::SmallVector<int64_t> getShardShape() const;
AffineMap projectOnto(AffineMap linearMap, ArrayRef<int64_t> logicalTensorShape, GridAttr grid) const;
AffineMap projectOnto(AffineMap linearMap, AffineMap physicalMemoryMap, ArrayRef<int64_t> logicalTensorShape) const;
AffineMap getIdentityTileLinearMap() const;
llvm::SmallVector<int64_t> getTiledShape(ArrayRef<int64_t> logicalTensorShape) const;
}];
@@ -321,18 +321,33 @@ def TT_BufferAttr : TT_Attr<"Buffer", "buffer", []> {
}

def TT_DeviceAttr : TT_Attr<"Device", "device", []> {
let summary = "Device attribute in TT dialect";
let summary = "Device attribute in TT dialect.";
let description = [{
Describes the physical layout of a device in the system and is made up of a few components:
- A grid attribute that describes the device's compute grid shape. It not only describes the shape of the compute grid, but also
carries an affine map that describes how the logical grid maps to the physical grid.
- Two affine maps that describe how a tensor layout's linear attribute maps to the L1 and DRAM memory spaces.
- An array of chip ids that this device is made up of.
}];
let parameters = (ins TT_GridAttr:$grid, ArrayRefParameter<"unsigned">:$chipIds);
let assemblyFormat = "`<` qualified($grid) `,` `[` $chipIds `]` `>`";
let parameters = (ins TT_GridAttr:$workerGrid,
"AffineMap":$l1Map,
"AffineMap":$dramMap,
ArrayRefParameter<"unsigned">:$chipIds);
let assemblyFormat = "`<` `workerGrid` `=` qualified($workerGrid) `,` `l1Map` `=` qualified($l1Map) `,` `dramMap` `=` qualified($dramMap) `,` `chipIds` `=` `[` $chipIds `]` `>`";

let extraClassDeclaration = [{
static DeviceAttr get(::mlir::MLIRContext *context, ArrayRef<int64_t> shape, AffineMap physicalGridMapping, ArrayRef<unsigned> chipIds) {
return DeviceAttr::get(context, GridAttr::get(context, shape, physicalGridMapping), chipIds);
}
static DeviceAttr get(::mlir::MLIRContext *context, SystemDescAttr systemDesc, ArrayRef<unsigned> chipIds);
static DeviceAttr get(::mlir::MLIRContext *context, SystemDescAttr systemDesc, bool enableMultichip = false);
AffineMap getMapForMemorySpace(MemorySpace memorySpace) const {
switch (memorySpace) {
case MemorySpace::DeviceL1:
return getL1Map();
case MemorySpace::DeviceDRAM:
return getDramMap();
default:
llvm_unreachable("Unsupported memory space");
}
}
}];

let genVerifyDecl = 1;
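For reference, a minimal sketch (not part of the diff; `inspectDevice` and the
single chip id are illustrative) of building the new `DeviceAttr` and reading
the components described in its new description:

```cpp
#include "mlir/IR/AffineMap.h"
#include "mlir/IR/MLIRContext.h"
#include "ttmlir/Dialect/TT/IR/TTOpsTypes.h"

// Build a DeviceAttr from a system descriptor and query the generated
// accessors for its workerGrid, l1Map, and dramMap parameters.
void inspectDevice(mlir::MLIRContext *ctx, mlir::tt::SystemDescAttr systemDesc) {
  unsigned chipIds[] = {0};
  auto device = mlir::tt::DeviceAttr::get(ctx, systemDesc, chipIds);

  mlir::tt::GridAttr workerGrid = device.getWorkerGrid(); // physical compute grid
  mlir::AffineMap l1Map = device.getL1Map();     // linear tensor coord -> physical L1 address
  mlir::AffineMap dramMap = device.getDramMap(); // linear tensor coord -> physical DRAM address
  (void)workerGrid;
  (void)l1Map;
  (void)dramMap;
}
```
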
79 changes: 67 additions & 12 deletions include/ttmlir/Dialect/TT/Utils/PhysicalCoreCoord.h
@@ -21,12 +21,20 @@ struct PhysicalCoreCoord {

std::int64_t &operator[](std::size_t i) {
assert(i < 3);
return i == 0 ? d : i == 1 ? y : x;
switch (i) {
case 0:
return d;
case 1:
return y;
case 2:
return x;
default:
llvm_unreachable("invalid index");
}
}

std::int64_t operator[](std::size_t i) const {
assert(i < 3);
return i == 0 ? d : i == 1 ? y : x;
return (*const_cast<PhysicalCoreCoord *>(this))[i];
}

bool operator==(PhysicalCoreCoord const &other) const {
@@ -36,32 +36,44 @@ struct PhysicalCoreCoord {

class PhysicalCoreCoordMapping {
public:
PhysicalCoreCoordMapping(ArrayRef<tt::ChipDescAttr> chipDescs) {
ArrayRef<int64_t> firstChipGrid = chipDescs.front().getGrid();
static PhysicalCoreCoordMapping
getWorkerMapping(ArrayRef<unsigned> chipIds,
ArrayRef<tt::ChipDescAttr> chipDescs) {
SmallVector<std::array<int64_t, 2>> physCores;
ArrayRef<int64_t> firstChipGrid = chipDescs[chipIds.front()].getGrid();
assert(firstChipGrid.size() == 2);
grid = {firstChipGrid[0], firstChipGrid[1]};
std::array<int64_t, 2> grid = {firstChipGrid[0], firstChipGrid[1]};

workers.reserve(chipDescs.size() * grid[0] * grid[1]);
for (auto chipDesc : chipDescs) {
physCores.reserve(chipIds.size() * grid[0] * grid[1]);
for (auto chipId : chipIds) {
auto chipDesc = chipDescs[chipId];
auto chipGrid = chipDesc.getGrid();
assert(chipGrid == firstChipGrid);
ChipPhysicalCoresAttr chipPhysicalCores = chipDesc.getChipPhysicalCores();
assert(chipPhysicalCores.getWorker().size() ==
static_cast<size_t>(grid[0] * grid[1]));
for (auto worker : chipPhysicalCores.getWorker()) {
workers.push_back({worker.getY(), worker.getX()});
physCores.push_back({worker.getY(), worker.getX()});
}
}
assert(workers.size() == chipDescs.size() * grid[0] * grid[1]);
assert(physCores.size() == chipIds.size() * grid[0] * grid[1]);
return PhysicalCoreCoordMapping(grid, physCores);
}

static PhysicalCoreCoordMapping
getDramMapping(ArrayRef<unsigned> chipIds,
ArrayRef<tt::ChipDescAttr> chipDescs) {
ArrayRef<CoreCoordAttr> firstChipDramCores =
chipDescs[chipIds.front()].getChipPhysicalCores().getDram();

std::array<int64_t, 2> grid = {
1, static_cast<int64_t>(firstChipDramCores.size())};
SmallVector<std::array<int64_t, 2>> physCores;
physCores.reserve(chipIds.size() * grid[0] * grid[1]);
for (auto chipId : chipIds) {
auto chipDesc = chipDescs[chipId];
ChipPhysicalCoresAttr chipPhysicalCores = chipDesc.getChipPhysicalCores();
assert(chipPhysicalCores.getDram().size() ==
static_cast<size_t>(grid[0] * grid[1]));
for (auto dram : chipPhysicalCores.getDram()) {
physCores.push_back({dram.getY(), dram.getX()});
}
}
assert(physCores.size() == chipIds.size() * grid[0] * grid[1]);
return PhysicalCoreCoordMapping(grid, physCores);
}

static PhysicalCoreCoordMapping
getMemorySpaceMapping(ArrayRef<unsigned> chipIds,
ArrayRef<tt::ChipDescAttr> chipDescs,
MemorySpace memorySpace) {
switch (memorySpace) {
case MemorySpace::DeviceL1:
return getWorkerMapping(chipIds, chipDescs);
case MemorySpace::DeviceDRAM:
return getDramMapping(chipIds, chipDescs);
default:
llvm_unreachable("unsupported memory space");
}
}

std::array<int64_t, 2> operator[](PhysicalCoreCoord coord) const {
return workers[coord.d * grid[0] * grid[1] + coord.y * grid[1] + coord.x];
return physCores[coord.d * grid[0] * grid[1] + coord.y * grid[1] + coord.x];
}

private:
PhysicalCoreCoordMapping(std::array<int64_t, 2> grid,
SmallVector<std::array<int64_t, 2>> physCores)
: grid(grid), physCores(physCores) {}

private:
std::array<int64_t, 2> grid;
SmallVector<std::array<int64_t, 2>> workers;
SmallVector<std::array<int64_t, 2>> physCores;
};
} // namespace mlir::tt

16 changes: 10 additions & 6 deletions include/ttmlir/Target/Utils/MLIRToFlatbuffer.h
@@ -289,19 +289,23 @@ toFlatbuffer(FlatbufferObjectCache &cache, GridAttr tensorGrid,
::ttmlir::utils::sample(
tensorGridShape, [&](ArrayRef<std::int64_t> virtualCoreCoord) {
SmallVector<std::int64_t> coreCoord = mapping.compose(virtualCoreCoord);
assert(coreCoord.size() == 3 && "expected a 2D core");
assert(coreCoord[0] == 0 && "expected single device");
assert(coreCoord.size() == PhysGridResultIdx::NumIndices &&
"expected a 2D core");
assert(coreCoord[PhysGridResultIdx::DeviceIdx] == 0 &&
"expected single device");
if (!coreRangeSet.empty() &&
((coreRangeSet.back().loc().y() == coreCoord[1]) &&
((coreRangeSet.back().loc().y() ==
coreCoord[PhysGridResultIdx::CoreCoordY]) &&
(coreRangeSet.back().loc().x() + coreRangeSet.back().size().x()) ==
coreCoord[2])) {
coreCoord[PhysGridResultIdx::CoreCoordX])) {
coreRangeSet.back() = ::tt::target::Dim2dRange(
coreRangeSet.back().loc(),
::tt::target::Dim2d(coreRangeSet.back().size().y(),
coreRangeSet.back().size().x() + 1));
} else {
coreRangeSet.push_back(::tt::target::Dim2dRange(
::tt::target::Dim2d(coreCoord[1], coreCoord[2]),
::tt::target::Dim2d(coreCoord[PhysGridResultIdx::CoreCoordY],
coreCoord[PhysGridResultIdx::CoreCoordX]),
::tt::target::Dim2d(1, 1)));
}
if (coreRangeSet.size() > 1 &&
@@ -401,7 +405,7 @@ layoutAttrToFlatbuffer(FlatbufferObjectCache &cache, Attribute attr,
auto strideInt64 = layoutAttr.getStride(logicalShape);
std::vector<int32_t> stride(strideInt64.begin(), strideInt64.end());
auto coreRangeSet =
toFlatbuffer(cache, layoutAttr.getGrid(), deviceAttr.getGrid());
toFlatbuffer(cache, layoutAttr.getGrid(), deviceAttr.getWorkerGrid());
return ::tt::target::CreateLayoutDescDirect(
*cache.fbb, &stride, toFlatbuffer(cache, layoutAttr.getOobVal()),
&coreRangeSet,
4 changes: 4 additions & 0 deletions include/ttmlir/Utils.h
@@ -15,6 +15,10 @@ template <typename T> T alignUp(T ptr, T alignment) {
return (ptr + alignment - 1) & ~(alignment - 1);
}

template <typename T> T alignDown(T ptr, T alignment) {
return ptr & ~(alignment - 1);
}

template <typename Vector, typename Fn>
inline void sample(Vector const &shape, Fn fn) {
llvm::SmallVector<std::int64_t, 8> strides(shape.size());