Skip to content

Commit

Permalink
Support tt-metal semaphores (#919)
Browse files Browse the repository at this point in the history
Fix #915 

Add support for all tt-metal semaphore functions, as well as some
utility functions that are useful for using semaphores through the compiler.

For example, the code below

```cpp
Value zero = i32(0, builder);
auto sem_id = builder.create<ttkernel::GetArgValOp>(arithOrMathOp.getLoc(), zero);
auto sem_addr = builder.create<ttkernel::GetSemaphoreOp>(arithOrMathOp.getLoc(), sem_id);
auto sem_l1_ptr = builder.create<ttkernel::CastToL1PtrOp>(arithOrMathOp.getLoc(), sem_addr);
builder.create<ttkernel::NocSemaphoreWaitOp>(arithOrMathOp.getLoc(), sem_l1_ptr, zero);
```

will generate the following kernel code:

```cpp
int32_t v5 = get_arg_val<uint32_t>(v4);
int32_t v6 = get_semaphore(v5);
volatile tt_l1_ptr uint32_t*  v7 = reinterpret_cast<volatile tt_l1_ptr uint32_t*>(v6);
noc_semaphore_wait(v7, v4);
```

### TODO

- [ ] Implement conversion to `tt_l1_ptr` properly

---------

Co-authored-by: Jacob DeSousa <jdesousa@tenstorrent.com>
  • Loading branch information
pjanevskiTT and jdesousa-TT authored Jan 22, 2025
1 parent 16549e2 commit 4ba1b73
Show file tree
Hide file tree
Showing 5 changed files with 259 additions and 83 deletions.
167 changes: 147 additions & 20 deletions include/ttmlir/Dialect/TTKernel/IR/TTKernelOps.td
Original file line number Diff line number Diff line change
Expand Up @@ -537,6 +537,153 @@ def TTKernel_NocAsyncWriteBarrierOp : TTKernel_Op<"noc_async_write_barrier"> {
}];
}

// Op `get_semaphore`: looks up the address of a semaphore from its id.
//   Operand: $semaphore_id (I32).
//   Result:  $sem_addr (TTKernel_L1Addr).
def TTKernel_GetSemaphoreOp : TTKernel_Op<"get_semaphore"> {
let summary = "GetSemaphoreOp";
let description = [{
Get L1 addr of the semaphore with specified semaphore id
}];

let arguments = (ins I32:$semaphore_id);
let results = (outs TTKernel_L1Addr:$sem_addr);
}

// Op `noc_semaphore_inc`: atomic increment of a remote semaphore; produces no
// results.
//   Operands: $addr (TTKernel_NocAddr), $incr (I32), $noc_id (I32).
//   NOTE(review): $noc_id presumably selects which NoC carries the request --
//   confirm against the tt-metal dataflow API.
def TTKernel_NocSemaphoreIncOp : TTKernel_Op<"noc_semaphore_inc"> {
let summary = "NocSemaphoreInc";
let description = [{
The Tensix core executing this function call initiates an atomic increment
(with 32-bit wrap) of a remote Tensix core L1 memory address. This L1 memory
address is used as a semaphore of size 4 Bytes, as a synchronization
mechanism.
}];

let arguments = (ins TTKernel_NocAddr:$addr, I32:$incr, I32:$noc_id);
}

// Op `noc_semaphore_set`: writes a value to a local semaphore; produces no
// results.
//   Operands: $sem_addr (TTKernel_L1AddrPtr), $val (I32).
def TTKernel_NocSemaphoreSetOp : TTKernel_Op<"noc_semaphore_set"> {
let summary = "NocSemaphoreSet";
let description = [{
Sets the value of a local L1 memory address on the Tensix core executing
this function to a specific value. This L1 memory address is used as a
semaphore of size 4 Bytes, as a synchronization mechanism. Also, see
*noc_semaphore_wait*.
}];

let arguments = (ins TTKernel_L1AddrPtr:$sem_addr, I32:$val);
}

// Op `noc_semaphore_wait`: blocks until a local semaphore equals a target
// value; produces no results.
//   Operands: $sem_addr (TTKernel_L1AddrPtr), $val (I32, the target value).
def TTKernel_NocSemaphoreWaitOp : TTKernel_Op<"noc_semaphore_wait"> {
let summary = "NocSemaphoreWait";
let description = [{
A blocking call that waits until the value of a local L1 memory address on
the Tensix core executing this function becomes equal to a target value.
This L1 memory address is used as a semaphore of size 4 Bytes, as a
synchronization mechanism. Also, see *noc_semaphore_set*.
}];

let arguments = (ins TTKernel_L1AddrPtr:$sem_addr, I32:$val);
}

// Op `noc_semaphore_wait_min`: blocks until a local semaphore is greater than
// or equal to a target value; produces no results.
//   Operands: $sem_addr (TTKernel_L1AddrPtr), $val (I32, the minimum value).
def TTKernel_NocSemaphoreWaitMinOp : TTKernel_Op<"noc_semaphore_wait_min"> {
let summary = "NocSemaphoreWaitMin";
let description = [{
A blocking call that waits until the value of a local L1 memory address on
the Tensix core executing this function becomes equal or greater than a target value.
This L1 memory address is used as a semaphore of size 4 Bytes, as a
synchronization mechanism. Also, see *noc_semaphore_set*.
}];

let arguments = (ins TTKernel_L1AddrPtr:$sem_addr, I32:$val);
}

// Op `noc_semaphore_set_multicast`: multicasts a 4-byte semaphore value from
// local L1 to a grid of destination cores, excluding the sender; produces no
// results.
//   Operands:   $src_local_l1_addr (TTKernel_L1Addr),
//               $dst_noc_addr_multicast (TTKernel_NocAddr),
//               $num_dests (I32).
//   Attributes: $linked, $multicast_path_reserve (BoolAttr, so they are
//               constant attributes on the op rather than SSA operands).
def TTKernel_NocSemaphoreSetMulticastOp : TTKernel_Op<"noc_semaphore_set_multicast"> {
let summary = "NocSemaphoreSetMulticast";
let description = [{
Initiates an asynchronous write from a source address in L1 memory on the
Tensix core executing this function call to a rectangular destination grid.
The destinations are specified using a uint64_t encoding referencing an
on-chip grid of nodes located at NOC coordinate range
(x_start,y_start,x_end,y_end) and a local address created using
*get_noc_multicast_addr* function. The size of data that is sent is 4 Bytes.
This is usually used to set a semaphore value at the destination nodes, as a
way of a synchronization mechanism. The same as *noc_async_write_multicast*
with preset size of 4 Bytes.
With this API, the multicast sender cannot be part of the multicast
destinations. If the multicast sender has to be in the multicast
destinations (i.e. must perform a local L1 write), the other API variant
*noc_semaphore_set_multicast_loopback_src* can be used.
}];

let arguments = (ins TTKernel_L1Addr:$src_local_l1_addr,
TTKernel_NocAddr:$dst_noc_addr_multicast,
I32:$num_dests,
BoolAttr:$linked,
BoolAttr:$multicast_path_reserve);
}

// Op `noc_semaphore_set_multicast_loopback_src`: same argument list as
// `noc_semaphore_set_multicast`; produces no results.
//   Operands:   $src_local_l1_addr (TTKernel_L1Addr),
//               $dst_noc_addr_multicast (TTKernel_NocAddr),
//               $num_dests (I32).
//   Attributes: $linked, $multicast_path_reserve (BoolAttr, so they are
//               constant attributes on the op rather than SSA operands).
//   NOTE(review): the op name suggests the sender is included among the
//   destinations, but the description below does not say so explicitly --
//   confirm against the tt-metal dataflow API.
def TTKernel_NocSemaphoreSetMulticastLoopbackOp : TTKernel_Op<"noc_semaphore_set_multicast_loopback_src"> {
let summary = "NocSemaphoreSetMulticastLoopback";
let description = [{
Initiates an asynchronous write from a source address in L1 memory on the
Tensix core executing this function call to a rectangular destination grid.
The destinations are specified using a uint64_t encoding referencing an
on-chip grid of nodes located at NOC coordinate range
(x_start,y_start,x_end,y_end) and a local address created using
*get_noc_multicast_addr* function. The size of data that is sent is 4 Bytes.
This is usually used to set a semaphore value at the destination nodes, as a
way of a synchronization mechanism. The same as *noc_async_write_multicast*
with preset size of 4 Bytes.
Note: With this API, sending data only to the source node (when num_dests
is 1) may result in unexpected behaviour. For some parameters, hangs have
been observed. For some other parameters, nothing may happen. Consider using
regular non multicast operations such as *noc_async_write* in this case.
}];

let arguments = (ins TTKernel_L1Addr:$src_local_l1_addr,
TTKernel_NocAddr:$dst_noc_addr_multicast,
I32:$num_dests,
BoolAttr:$linked,
BoolAttr:$multicast_path_reserve);
}

//===----------------------------------------------------------------------===//
// TTKernel Compile and runtime arguments operations
//===----------------------------------------------------------------------===//

// Op `get_arg_val`: reads the runtime argument at a given index.
//   Operand: $arg_index (I32).
//   Result:  $arg_val, constrained to be either TTKernel_Semaphore or I32.
def TTKernel_GetArgValOp : TTKernel_Op<"get_arg_val"> {
let summary = "Get runtime arg value.";
let description = [{
Get runtime argument value at specified index.
}];

let arguments = (ins I32:$arg_index);

// The result type is chosen by whoever builds the op; both alternatives are
// accepted by the verifier generated from this constraint.
let results = (outs AnyTypeOf<[TTKernel_Semaphore, I32]>:$arg_val);
}

//===----------------------------------------------------------------------===//
// TTKernel Helper functions
//===----------------------------------------------------------------------===//

// Op that casts an address to an L1 pointer.
//   Operand: $addr, either I32 or TTKernel_L1Addr.
//   Result:  $l1_ptr (TTKernel_L1AddrPtr).
// The op mnemonic is deliberately the C++ cast expression itself.
// NOTE(review): presumably the EmitC lowering reuses the op name verbatim as
// the callee text, which is what makes this spelling produce a valid cast in
// the generated kernel -- confirm in TTKernelToEmitC.
def TTKernel_CastToL1PtrOp : TTKernel_Op<"reinterpret_cast<volatile tt_l1_ptr uint32_t*>"> {
let summary = "CastToL1Ptr";
let description = [{
Cast specified addr to L1 pointer.
}];

let arguments = (ins AnyTypeOf<[I32, TTKernel_L1Addr]>:$addr);

let results = (outs TTKernel_L1AddrPtr:$l1_ptr);
}

// Op `store_to_l1`: stores a value through an L1 pointer; produces no results.
//   Operands: $value (I32), $l1_ptr (TTKernel_L1AddrPtr), $offset (I32).
//   NOTE(review): the unit of $offset (bytes vs. 32-bit elements) is not
//   stated here -- confirm against the lowering.
def TTKernel_StoreToL1Op : TTKernel_Op<"store_to_l1"> {
let summary = "StoreToL1";
let description = [{
Store value to L1.
}];

let arguments = (ins I32:$value, TTKernel_L1AddrPtr:$l1_ptr, I32:$offset);
}

//===----------------------------------------------------------------------===//
// TTKernel Multicast NoC operations
//===----------------------------------------------------------------------===//
Expand Down Expand Up @@ -660,24 +807,4 @@ def TTKernel_GetWritePtrOp : TTKernel_Op<"get_write_ptr"> {
let results = (outs I32:$writePtr);
}

// Op that casts an address to an L1 pointer.
//   Operand: $addr, either I32 or TTKernel_L1Addr.
//   Result:  $l1_ptr (TTKernel_L1AddrPtr).
// The op mnemonic is deliberately the C++ cast expression itself.
def TTKernel_CastToL1PtrOp : TTKernel_Op<"reinterpret_cast<volatile tt_l1_ptr uint32_t*>"> {
let summary = "CastToL1Ptr";
let description = [{
Cast specified addr to L1 pointer.
}];

let arguments = (ins AnyTypeOf<[I32, TTKernel_L1Addr]>:$addr);

let results = (outs TTKernel_L1AddrPtr:$l1_ptr);
}

// Op `store_to_l1`: stores a value through an L1 pointer; produces no results.
//   Operands: $value (I32), $l1_ptr (TTKernel_L1AddrPtr), $offset (I32).
def TTKernel_StoreToL1Op : TTKernel_Op<"store_to_l1"> {
let summary = "StoreToL1";
let description = [{
Store value to L1.
}];

let arguments = (ins I32:$value, TTKernel_L1AddrPtr:$l1_ptr, I32:$offset);
}

#endif
7 changes: 7 additions & 0 deletions include/ttmlir/Dialect/TTKernel/IR/TTKernelOpsTypes.td
Original file line number Diff line number Diff line change
Expand Up @@ -105,6 +105,13 @@ def TTKernel_CB : TTKernel_Type<"CB", "cb"> {
}];
}

// Semaphore type for the TTKernel dialect.
//   Parameter: $initial_value (uint32_t).
//   Assembly:  the parameter is printed/parsed between angle brackets, i.e.
//              `<initial_value>` following the type mnemonic.
def TTKernel_Semaphore : TTKernel_Type<"Semaphore", "semaphore"> {
let summary = "TTKernel semaphore";
let description = "Semaphore type in TTKernel dialect";
let parameters = (ins "uint32_t":$initial_value);
let assemblyFormat = "`<` $initial_value `>`";
}

def TTKernel_NocAddr : TTKernel_Type<"NocAddr", "noc_addr"> {
let summary = "TTKernel noc address";
let description = "Noc address type in TTKernel dialect";
Expand Down
125 changes: 72 additions & 53 deletions lib/Conversion/TTKernelToEmitC/TTKernelToEmitC.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -181,6 +181,10 @@ class TTMetalToEmitCFuncArgsRewriter
rewriter.startOpModification(op);
rewriter.setInsertionPointToStart(&op.getCallableRegion()->front());
for (auto arg : blockArgs) {
// Skip initialization if the argument is not a CBType (e.g. a SemaphoreType)
if (!mlir::isa<ttkernel::CBType>(arg.getType())) {
continue;
}
auto cb = cast<ttkernel::CBType>(arg.getType());
// Get opaque type i.e emitc::LValueType<emitc::OpaqueType>
auto cbType = getTypeConverter()->convertType(cb);
Expand Down Expand Up @@ -276,6 +280,12 @@ class TTMetalToEmitCOpaqueRewriter : public OpConversionPattern<SourceOp> {
template_args.push_back(
emitc::OpaqueAttr::get(op.getContext(), reduceDim));
return ArrayAttr::get(op.getContext(), template_args);
} else if constexpr (std::is_same_v<SourceOp, ttkernel::GetArgValOp>) {
SmallVector<Attribute, 1> template_args;

template_args.push_back(
emitc::OpaqueAttr::get(op.getContext(), "uint32_t"));
return ArrayAttr::get(op.getContext(), template_args);
}
return ArrayAttr();
}
Expand Down Expand Up @@ -381,59 +391,68 @@ class ConvertTTKernelToEmitCPass
target.addLegalOp<func::ReturnOp>();
target.addIllegalDialect<ttkernel::TTKernelDialect>();

patterns
.add<TTMetalToEmitCFuncArgsRewriter, TTMetalToEmitCReturnRewriter,
TTKernelMacroOpToEmitCOpRewriter<ttkernel::MemZerosBaseOp>,
TTKernelMacroOpToEmitCOpRewriter<ttkernel::MemZerosSizeOp>,
TTMetalToEmitCOpaqueRewriter<ttkernel::BuiltinOp>,
TTMetalToEmitCOpaqueRewriter<ttkernel::CopyTileInitOp>,
TTMetalToEmitCOpaqueRewriter<ttkernel::RecipTileInitOp>,
TTMetalToEmitCOpaqueRewriter<ttkernel::RecipTileOp>,
TTMetalToEmitCOpaqueRewriter<ttkernel::TileRegsAcquireOp>,
TTMetalToEmitCOpaqueRewriter<ttkernel::TileRegsCommitOp>,
TTMetalToEmitCOpaqueRewriter<ttkernel::TileRegsWaitOp>,
TTMetalToEmitCOpaqueRewriter<ttkernel::TileRegsReleaseOp>,
TTMetalToEmitCOpaqueRewriter<ttkernel::PackTileOp>,
TTMetalToEmitCOpaqueRewriter<ttkernel::CBPushBackOp>,
TTMetalToEmitCOpaqueRewriter<ttkernel::CBPopFrontOp>,
TTMetalToEmitCOpaqueRewriter<ttkernel::CBReserveBackOp>,
TTMetalToEmitCOpaqueRewriter<ttkernel::CBWaitFrontOp>,
TTMetalToEmitCOpaqueRewriter<ttkernel::TilizeInitOp>,
TTMetalToEmitCOpaqueRewriter<ttkernel::UntilizeInitOp>,
TTMetalToEmitCOpaqueRewriter<ttkernel::TilizeBlockOp>,
TTMetalToEmitCOpaqueRewriter<ttkernel::UntilizeBlockOp>,
TTMetalToEmitCOpaqueRewriter<ttkernel::BinaryOpInitCommonOp>,
TTMetalToEmitCOpaqueRewriter<ttkernel::AddTilesInitOp>,
TTMetalToEmitCOpaqueRewriter<ttkernel::MulTilesInitOp>,
TTMetalToEmitCOpaqueRewriter<ttkernel::MulTilesInitFOp>,
TTMetalToEmitCOpaqueRewriter<ttkernel::MaxTilesInitOp>,
TTMetalToEmitCOpaqueRewriter<ttkernel::AddTilesOp>,
TTMetalToEmitCOpaqueRewriter<ttkernel::MulTilesOp>,
TTMetalToEmitCOpaqueRewriter<ttkernel::MaxTilesOp>,
TTMetalToEmitCOpaqueRewriter<ttkernel::ReduceInitOp>,
TTMetalToEmitCOpaqueRewriter<ttkernel::ReduceTileOp>,
TTMetalToEmitCOpaqueRewriter<ttkernel::GetNocAddrOp>,
TTMetalToEmitCOpaqueRewriter<ttkernel::NocAsyncReadOp>,
TTMetalToEmitCOpaqueRewriter<
ttkernel::NocAsyncReadOnePacketSetStateOp>,
TTMetalToEmitCOpaqueRewriter<
ttkernel::NocAsyncReadOnePacketWithStateOp>,
TTMetalToEmitCOpaqueRewriter<ttkernel::NocAsyncReadBarrierOp>,
TTMetalToEmitCOpaqueRewriter<ttkernel::NocAsyncWriteOp>,
TTMetalToEmitCOpaqueRewriter<ttkernel::NocAsyncWriteBarrierOp>,
TTMetalToEmitCOpaqueRewriter<ttkernel::GetNocMulticastAddrOp>,
TTMetalToEmitCOpaqueRewriter<
ttkernel::NocAsyncWriteMulticastOnePacketOp>,
TTMetalToEmitCOpaqueRewriter<ttkernel::NocAsyncWriteMulticastOp>,
TTMetalToEmitCOpaqueRewriter<
ttkernel::NocAsyncWriteMulticastLoopbackSrcOp>,
TTMetalToEmitCOpaqueRewriter<ttkernel::UnaryOpInitCommonOp>,
TTMetalToEmitCOpaqueRewriter<ttkernel::CopyTileOp>,
TTMetalToEmitCOpaqueRewriter<ttkernel::ExpTileInitOp>,
TTMetalToEmitCOpaqueRewriter<ttkernel::ExpTileOp>,
TTMetalToEmitCOpaqueRewriter<ttkernel::GetWritePtrOp>,
TTMetalToEmitCOpaqueRewriter<ttkernel::CastToL1PtrOp>>(
typeConverter, funcOp.getContext());
patterns.add<
TTMetalToEmitCFuncArgsRewriter, TTMetalToEmitCReturnRewriter,
TTKernelMacroOpToEmitCOpRewriter<ttkernel::MemZerosBaseOp>,
TTKernelMacroOpToEmitCOpRewriter<ttkernel::MemZerosSizeOp>,
TTMetalToEmitCOpaqueRewriter<ttkernel::BuiltinOp>,
TTMetalToEmitCOpaqueRewriter<ttkernel::GetArgValOp>,
TTMetalToEmitCOpaqueRewriter<ttkernel::CastToL1PtrOp>,
TTMetalToEmitCOpaqueRewriter<ttkernel::GetSemaphoreOp>,
TTMetalToEmitCOpaqueRewriter<ttkernel::NocSemaphoreSetOp>,
TTMetalToEmitCOpaqueRewriter<ttkernel::NocSemaphoreWaitMinOp>,
TTMetalToEmitCOpaqueRewriter<ttkernel::NocSemaphoreIncOp>,
TTMetalToEmitCOpaqueRewriter<ttkernel::NocSemaphoreWaitOp>,
TTMetalToEmitCOpaqueRewriter<ttkernel::NocSemaphoreSetMulticastOp>,
TTMetalToEmitCOpaqueRewriter<
ttkernel::NocSemaphoreSetMulticastLoopbackOp>,
TTMetalToEmitCOpaqueRewriter<ttkernel::CopyTileInitOp>,
TTMetalToEmitCOpaqueRewriter<ttkernel::RecipTileInitOp>,
TTMetalToEmitCOpaqueRewriter<ttkernel::RecipTileOp>,
TTMetalToEmitCOpaqueRewriter<ttkernel::TileRegsAcquireOp>,
TTMetalToEmitCOpaqueRewriter<ttkernel::TileRegsCommitOp>,
TTMetalToEmitCOpaqueRewriter<ttkernel::TileRegsWaitOp>,
TTMetalToEmitCOpaqueRewriter<ttkernel::TileRegsReleaseOp>,
TTMetalToEmitCOpaqueRewriter<ttkernel::PackTileOp>,
TTMetalToEmitCOpaqueRewriter<ttkernel::CBPushBackOp>,
TTMetalToEmitCOpaqueRewriter<ttkernel::CBPopFrontOp>,
TTMetalToEmitCOpaqueRewriter<ttkernel::CBReserveBackOp>,
TTMetalToEmitCOpaqueRewriter<ttkernel::CBWaitFrontOp>,
TTMetalToEmitCOpaqueRewriter<ttkernel::TilizeInitOp>,
TTMetalToEmitCOpaqueRewriter<ttkernel::UntilizeInitOp>,
TTMetalToEmitCOpaqueRewriter<ttkernel::TilizeBlockOp>,
TTMetalToEmitCOpaqueRewriter<ttkernel::UntilizeBlockOp>,
TTMetalToEmitCOpaqueRewriter<ttkernel::BinaryOpInitCommonOp>,
TTMetalToEmitCOpaqueRewriter<ttkernel::AddTilesInitOp>,
TTMetalToEmitCOpaqueRewriter<ttkernel::MulTilesInitOp>,
TTMetalToEmitCOpaqueRewriter<ttkernel::MulTilesInitFOp>,
TTMetalToEmitCOpaqueRewriter<ttkernel::MaxTilesInitOp>,
TTMetalToEmitCOpaqueRewriter<ttkernel::AddTilesOp>,
TTMetalToEmitCOpaqueRewriter<ttkernel::MulTilesOp>,
TTMetalToEmitCOpaqueRewriter<ttkernel::MaxTilesOp>,
TTMetalToEmitCOpaqueRewriter<ttkernel::ReduceInitOp>,
TTMetalToEmitCOpaqueRewriter<ttkernel::ReduceTileOp>,
TTMetalToEmitCOpaqueRewriter<ttkernel::GetNocAddrOp>,
TTMetalToEmitCOpaqueRewriter<ttkernel::NocAsyncReadOp>,
TTMetalToEmitCOpaqueRewriter<
ttkernel::NocAsyncReadOnePacketSetStateOp>,
TTMetalToEmitCOpaqueRewriter<
ttkernel::NocAsyncReadOnePacketWithStateOp>,
TTMetalToEmitCOpaqueRewriter<ttkernel::NocAsyncReadBarrierOp>,
TTMetalToEmitCOpaqueRewriter<ttkernel::NocAsyncWriteOp>,
TTMetalToEmitCOpaqueRewriter<ttkernel::NocAsyncWriteBarrierOp>,
TTMetalToEmitCOpaqueRewriter<ttkernel::GetNocMulticastAddrOp>,
TTMetalToEmitCOpaqueRewriter<
ttkernel::NocAsyncWriteMulticastOnePacketOp>,
TTMetalToEmitCOpaqueRewriter<ttkernel::NocAsyncWriteMulticastOp>,
TTMetalToEmitCOpaqueRewriter<
ttkernel::NocAsyncWriteMulticastLoopbackSrcOp>,
TTMetalToEmitCOpaqueRewriter<ttkernel::UnaryOpInitCommonOp>,
TTMetalToEmitCOpaqueRewriter<ttkernel::CopyTileOp>,
TTMetalToEmitCOpaqueRewriter<ttkernel::ExpTileInitOp>,
TTMetalToEmitCOpaqueRewriter<ttkernel::ExpTileOp>,
TTMetalToEmitCOpaqueRewriter<ttkernel::GetWritePtrOp>>(
typeConverter, funcOp.getContext());

patterns.add<TTMetalToEmitCOpaqueRewriter<ttkernel::GetNocAddrXYOp>>(
typeConverter, funcOp.getContext(), "get_noc_addr");
Expand Down
5 changes: 3 additions & 2 deletions lib/Dialect/TTMetal/IR/TTMetalOps.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -89,8 +89,9 @@ ::mlir::LogicalResult DispatchOp::verify() {
// Assert block inputs are CBs or semaphores
for (auto &region : getRegions()) {
for (auto arg : region.getArguments()) {
if (not mlir::isa<ttkernel::CBType>(arg.getType())) {
return emitOpError("Block inputs must be CBType");
if (!mlir::isa<ttkernel::CBType>(arg.getType()) &&
!mlir::isa<ttkernel::SemaphoreType>(arg.getType())) {
return emitOpError("Block inputs must be CBType or SemType");
}
}
}
Expand Down
Loading

0 comments on commit 4ba1b73

Please sign in to comment.