diff --git a/include/ttmlir/Dialect/TTKernel/IR/TTKernelOps.td b/include/ttmlir/Dialect/TTKernel/IR/TTKernelOps.td index c34d416152..94c8759887 100644 --- a/include/ttmlir/Dialect/TTKernel/IR/TTKernelOps.td +++ b/include/ttmlir/Dialect/TTKernel/IR/TTKernelOps.td @@ -438,6 +438,136 @@ def TTKernel_NocAsyncWriteBarrierOp : TTKernel_Op<"noc_async_write_barrier"> { }]; } +def TTKernel_GetSemaphoreOp : TTKernel_Op<"get_semaphore"> { + let summary = "GetSemaphoreOp"; + let description = [{ + Get L1 addr of the semaphore with specified semaphore id + }]; + + let arguments = (ins I32:$semaphore_id); + let results = (outs TTKernel_L1Addr:$sem_addr); +} + +def TTKernel_NocSemaphoreIncOp : TTKernel_Op<"noc_semaphore_inc"> { + let summary = "NocSemaphoreInc"; + let description = [{ + The Tensix core executing this function call initiates an atomic increment + (with 32-bit wrap) of a remote Tensix core L1 memory address. This L1 memory + address is used as a semaphore of size 4 Bytes, as a synchronization + mechanism. + }]; + + let arguments = (ins TTKernel_NocAddr:$addr, I32:$incr, I32:$noc_id); +} + +def TTKernel_NocSemaphoreSetOp : TTKernel_Op<"noc_semaphore_set"> { + let summary = "NocSemaphoreSet"; + let description = [{ + Sets the value of a local L1 memory address on the Tensix core executing + this function to a specific value. This L1 memory address is used as a + semaphore of size 4 Bytes, as a synchronization mechanism. Also, see + *noc_semaphore_wait*. + }]; + + let arguments = (ins I32:$sem_addr, I32:$val); +} + +def TTKernel_NocSemaphoreWaitOp : TTKernel_Op<"noc_semaphore_wait"> { + let summary = "NocSemaphoreWait"; + let description = [{ + A blocking call that waits until the value of a local L1 memory address on + the Tensix core executing this function becomes equal to a target value. + This L1 memory address is used as a semaphore of size 4 Bytes, as a + synchronization mechanism. Also, see *noc_semaphore_set*. + }]; + + let arguments = (ins TTKernel_L1AddrPtr:$sem_addr, I32:$val); +} + +def TTKernel_NocSemaphoreWaitMinOp : TTKernel_Op<"noc_semaphore_wait_min"> { + let summary = "NocSemaphoreWaitMin"; + let description = [{ + A blocking call that waits until the value of a local L1 memory address on + the Tensix core executing this function becomes equal to a target value. + This L1 memory address is used as a semaphore of size 4 Bytes, as a + synchronization mechanism. Also, see *noc_semaphore_set*. + }]; + + let arguments = (ins TTKernel_L1AddrPtr:$sem_addr, I32:$val); +} + +def TTKernel_NocSemaphoreSetMulticastOp : TTKernel_Op<"noc_semaphore_set_multicast"> { + let summary = "NocSemaphoreSetMulticast"; + let description = [{ + Initiates an asynchronous write from a source address in L1 memory on the + Tensix core executing this function call to a rectangular destination grid. + The destinations are specified using a uint64_t encoding referencing an + on-chip grid of nodes located at NOC coordinate range + (x_start,y_start,x_end,y_end) and a local address created using + *get_noc_multicast_addr* function. The size of data that is sent is 4 Bytes. + This is usually used to set a semaphore value at the destination nodes, as a + way of a synchronization mechanism. The same as *noc_async_write_multicast* + with preset size of 4 Bytes. + With this API, the multicast sender cannot be part of the multicast + destinations. If the multicast sender has to be in the multicast + destinations (i.e. must perform a local L1 write), the other API variant + *noc_semaphore_set_multicast_loopback_src* can be used. + }]; + + let arguments = (ins I32:$src_local_l1_addr, TTKernel_NocAddr:$dst_noc_addr_multicast, I32:$num_dests, BoolAttr:$linked, BoolAttr:$multicast_path_reserve); +} + +def TTKernel_NocSemaphoreSetMulticastLoopbackOp : TTKernel_Op<"noc_semaphore_set_multicast_loopback_src"> { + let summary = "NocSemaphoreSetMulticastLoopback"; + let description = [{ + Initiates an asynchronous write from a source address in L1 memory on the + Tensix core executing this function call to a rectangular destination grid. + The destinations are specified using a uint64_t encoding referencing an + on-chip grid of nodes located at NOC coordinate range + (x_start,y_start,x_end,y_end) and a local address created using + *get_noc_multicast_addr* function. The size of data that is sent is 4 Bytes. + This is usually used to set a semaphore value at the destination nodes, as a + way of a synchronization mechanism. The same as *noc_async_write_multicast* + with preset size of 4 Bytes. + Note: With this API, sending data only to the source node (when num_dests + is 1) may result in unexpected behaviour. For some parameters, hangs have + been observed. For some other parameters, nothing may happen. Consider using + regular non multicast operations such as *noc_async_write* in this case. + }]; + + let arguments = (ins I32:$src_local_l1_addr, TTKernel_NocAddr:$dst_noc_addr_multicast, I32:$num_dests, BoolAttr:$linked, BoolAttr:$multicast_path_reserve); +} + +//===----------------------------------------------------------------------===// +// TTKernel Compile and runtime arguments operations +//===----------------------------------------------------------------------===// + +def TTKernel_GetArgValOp : TTKernel_Op<"get_arg_val"> { + let summary = "Get runtime arg value."; + let description = [{ + Get runtime argument value at specified index. + }]; + + let arguments = (ins I32:$arg_index); + + let results = (outs I32:$arg_val); +} + +//===----------------------------------------------------------------------===// +// TTKernel Helper functions +//===----------------------------------------------------------------------===// + +def TTKernel_CastToL1PtrOp : TTKernel_Op<"reinterpret_cast"> { + let summary = "CastToL1Ptr"; + let description = [{ + Cast specified addr to L1 pointer. + }]; + + let arguments = (ins TTKernel_L1Addr:$addr); + + let results = (outs TTKernel_L1AddrPtr:$l1_ptr); +} + //===----------------------------------------------------------------------===// // TTKernel Misc operations //===----------------------------------------------------------------------===// diff --git a/include/ttmlir/Dialect/TTKernel/IR/TTKernelOpsTypes.td b/include/ttmlir/Dialect/TTKernel/IR/TTKernelOpsTypes.td index 2956f314bf..7891ccc5e1 100644 --- a/include/ttmlir/Dialect/TTKernel/IR/TTKernelOpsTypes.td +++ b/include/ttmlir/Dialect/TTKernel/IR/TTKernelOpsTypes.td @@ -105,11 +105,28 @@ def TTKernel_CB : TTKernel_Type<"CB", "cb"> { }]; } +def TTKernel_Semaphore : TTKernel_Type<"Semaphore", "semaphore"> { + let summary = "TTKernel semaphore"; + let description = "Semaphore type in TTKernel dialect"; + let parameters = (ins "uint32_t":$initial_value); + let assemblyFormat = "`<` $initial_value `>`"; +} + def TTKernel_NocAddr : TTKernel_Type<"NocAddr", "noc_addr"> { let summary = "TTKernel noc address"; let description = "Noc address type in TTKernel dialect"; } +def TTKernel_L1Addr : TTKernel_Type<"L1Addr", "l1_addr"> { + let summary = "TTKernel l1 address"; + let description = "L1 address type in TTKernel dialect"; +} + +def TTKernel_L1AddrPtr : TTKernel_Type<"L1AddrPtr", "l1_addr_ptr"> { + let summary = "TTKernel l1 address pointer"; + let description = "L1 pointer address type in TTKernel dialect"; +} + def TTKernel_ThreadTypeAttr : EnumAttr { let assemblyFormat = "`<` $value `>`"; } diff --git a/lib/Conversion/TTKernelToEmitC/TTKernelToEmitC.cpp b/lib/Conversion/TTKernelToEmitC/TTKernelToEmitC.cpp index 3307951971..4424ca2898 100644 --- a/lib/Conversion/TTKernelToEmitC/TTKernelToEmitC.cpp +++ b/lib/Conversion/TTKernelToEmitC/TTKernelToEmitC.cpp @@ -117,6 +117,16 @@ class TTKernelToEmitCTypeConverter : public TypeConverter { addConversion([ctx](mlir::tt::ttkernel::CBType type) -> Type { return Builder(ctx).getType("::tt::CB"); }); + addConversion([ctx](mlir::tt::ttkernel::L1AddrType type) -> Type { + return Builder(ctx).getI32Type(); + }); + addConversion([ctx](mlir::tt::ttkernel::L1AddrPtrType type) -> Type { + // TODO: This is a very hacky way to get around the limitation that emitc + // won't emit strings that have * at the back of the string. That is why + // we have space at the end. + return Builder(ctx).getType( + "volatile tt_l1_ptr uint32_t* "); + }); } }; @@ -137,6 +147,10 @@ class TTMetalToEmitCFuncArgsRewriter rewriter.startOpModification(op); rewriter.setInsertionPointToStart(&op.getCallableRegion()->front()); for (auto arg : blockArgs) { + // Skip initialization if the argument is not a CBType (SemaphoreType) + if (!mlir::isa(arg.getType())) { + continue; + } auto cb = cast(arg.getType()); auto cbType = getTypeConverter()->convertType(cb); auto var = rewriter.create( @@ -187,6 +201,18 @@ class TTMetalToEmitCOpaqueRewriter : public OpConversionPattern { return name; } + ArrayAttr getTemplateArgs(SourceOp op) const { + if constexpr (std::is_same_v) { + SmallVector template_args; + + template_args.push_back( + emitc::OpaqueAttr::get(op.getContext(), "uint32_t")); + + return ArrayAttr::get(op.getContext(), template_args); + } + return ArrayAttr(); + } + LogicalResult matchAndRewrite(SourceOp op, Adaptor adaptor, ConversionPatternRewriter &rewriter) const final { @@ -199,7 +225,7 @@ class TTMetalToEmitCOpaqueRewriter : public OpConversionPattern { resultTypes.push_back(ct); } rewriter.replaceOpWithNewOp( - op, resultTypes, getOpName(op), nullptr, nullptr, + op, resultTypes, getOpName(op), nullptr, getTemplateArgs(op), adaptor.getOperands()); return success(); } @@ -258,41 +284,51 @@ class ConvertTTKernelToEmitCPass target.addLegalOp(); target.addIllegalDialect(); - patterns - .add, - TTMetalToEmitCOpaqueRewriter, - TTMetalToEmitCOpaqueRewriter, - TTMetalToEmitCOpaqueRewriter, - TTMetalToEmitCOpaqueRewriter, - TTMetalToEmitCOpaqueRewriter, - TTMetalToEmitCOpaqueRewriter, - TTMetalToEmitCOpaqueRewriter, - TTMetalToEmitCOpaqueRewriter, - TTMetalToEmitCOpaqueRewriter, - TTMetalToEmitCOpaqueRewriter, - TTMetalToEmitCOpaqueRewriter, - TTMetalToEmitCOpaqueRewriter, - TTMetalToEmitCOpaqueRewriter, - TTMetalToEmitCOpaqueRewriter, - TTMetalToEmitCOpaqueRewriter, - TTMetalToEmitCOpaqueRewriter, - TTMetalToEmitCOpaqueRewriter, - TTMetalToEmitCOpaqueRewriter, - TTMetalToEmitCOpaqueRewriter, - TTMetalToEmitCOpaqueRewriter, - TTMetalToEmitCOpaqueRewriter, - TTMetalToEmitCOpaqueRewriter, - TTMetalToEmitCOpaqueRewriter, - TTMetalToEmitCOpaqueRewriter, - TTMetalToEmitCOpaqueRewriter, - TTMetalToEmitCOpaqueRewriter, - TTMetalToEmitCOpaqueRewriter, - TTMetalToEmitCOpaqueRewriter, - TTMetalToEmitCOpaqueRewriter, - TTMetalToEmitCOpaqueRewriter, - TTMetalToEmitCOpaqueRewriter>( - typeConverter, funcOp.getContext()); + patterns.add< + TTMetalToEmitCFuncArgsRewriter, TTMetalToEmitCReturnRewriter, + TTMetalToEmitCOpaqueRewriter, + TTMetalToEmitCOpaqueRewriter, + TTMetalToEmitCOpaqueRewriter, + TTMetalToEmitCOpaqueRewriter, + TTMetalToEmitCOpaqueRewriter, + TTMetalToEmitCOpaqueRewriter, + TTMetalToEmitCOpaqueRewriter, + TTMetalToEmitCOpaqueRewriter, + TTMetalToEmitCOpaqueRewriter, + TTMetalToEmitCOpaqueRewriter< + ttkernel::NocSemaphoreSetMulticastLoopbackOp>, + TTMetalToEmitCOpaqueRewriter, + TTMetalToEmitCOpaqueRewriter, + TTMetalToEmitCOpaqueRewriter, + TTMetalToEmitCOpaqueRewriter, + TTMetalToEmitCOpaqueRewriter, + TTMetalToEmitCOpaqueRewriter, + TTMetalToEmitCOpaqueRewriter, + TTMetalToEmitCOpaqueRewriter, + TTMetalToEmitCOpaqueRewriter, + TTMetalToEmitCOpaqueRewriter, + TTMetalToEmitCOpaqueRewriter, + TTMetalToEmitCOpaqueRewriter, + TTMetalToEmitCOpaqueRewriter, + TTMetalToEmitCOpaqueRewriter, + TTMetalToEmitCOpaqueRewriter, + TTMetalToEmitCOpaqueRewriter, + TTMetalToEmitCOpaqueRewriter, + TTMetalToEmitCOpaqueRewriter, + TTMetalToEmitCOpaqueRewriter, + TTMetalToEmitCOpaqueRewriter, + TTMetalToEmitCOpaqueRewriter, + TTMetalToEmitCOpaqueRewriter, + TTMetalToEmitCOpaqueRewriter, + TTMetalToEmitCOpaqueRewriter, + TTMetalToEmitCOpaqueRewriter, + TTMetalToEmitCOpaqueRewriter, + TTMetalToEmitCOpaqueRewriter, + TTMetalToEmitCOpaqueRewriter, + TTMetalToEmitCOpaqueRewriter, + TTMetalToEmitCOpaqueRewriter, + TTMetalToEmitCOpaqueRewriter>( + typeConverter, funcOp.getContext()); if (failed(applyFullConversion(funcOp, target, std::move(patterns)))) { signalPassFailure(); diff --git a/lib/Dialect/TTMetal/IR/TTMetalOps.cpp b/lib/Dialect/TTMetal/IR/TTMetalOps.cpp index 49baf51e01..7ddd4671eb 100644 --- a/lib/Dialect/TTMetal/IR/TTMetalOps.cpp +++ b/lib/Dialect/TTMetal/IR/TTMetalOps.cpp @@ -89,8 +89,9 @@ ::mlir::LogicalResult DispatchOp::verify() { // Assert block inputs are CBs for (auto ®ion : getRegions()) { for (auto arg : region.getArguments()) { - if (not mlir::isa(arg.getType())) { - return emitOpError("Block inputs must be CBType"); + if (not(mlir::isa(arg.getType()) || + mlir::isa(arg.getType()))) { + return emitOpError("Block inputs must be CBType or SemType"); } } } diff --git a/lib/Target/TTMetal/TTMetalToFlatbuffer.cpp b/lib/Target/TTMetal/TTMetalToFlatbuffer.cpp index 6e1b5fb39d..b622392c0b 100644 --- a/lib/Target/TTMetal/TTMetalToFlatbuffer.cpp +++ b/lib/Target/TTMetal/TTMetalToFlatbuffer.cpp @@ -149,6 +149,15 @@ cbTypeToFlatbuffer(FlatbufferObjectCache &cache, ttkernel::CBType cbType) { memref, cbType.getPageSize(), cbType.getNumBuffers()); } +std::pair<::tt::target::metal::RuntimeArg, ::flatbuffers::Offset> +toFlatbuffer(FlatbufferObjectCache &cache, ttkernel::SemaphoreType sem) { + auto runtimeArgType = + ::tt::target::metal::RuntimeArg::RuntimeArgSemaphoreAddress; + auto semAddr = ::tt::target::metal::CreateRuntimeArgSemaphoreAddress( + *cache.fbb, sem.getInitialValue()); + return std::make_pair(runtimeArgType, semAddr.Union()); +} + std::pair<::tt::target::metal::HostBuffer, ::flatbuffers::Offset> hostBufferToFlatbuffer(FlatbufferObjectCache &cache, ElementsAttr elementsAttr) { @@ -239,14 +248,26 @@ static std::shared_ptr translateModuleToFlatbuffer(Operation *op) { toFlatbuffer(mlir::cast( dispatchOp.getCoreRanges()[region.getRegionNumber()]))}; std::vector<::flatbuffers::Offset<::tt::target::CBRef>> cbs; + std::vector<::tt::target::metal::RuntimeArg> runtime_args_type; + std::vector<::flatbuffers::Offset> runtime_args; for (auto arg : region.getArguments()) { - assert(arg.getArgNumber() < operands.size()); - auto cbType = mlir::cast(arg.getType()); - auto cbDesc = cache.getOrCreate(cbType, cbTypeToFlatbuffer); - auto tensorRef = operands[arg.getArgNumber()]; - cbs.push_back( - ::tt::target::CreateCBRef(fbb, cache.global_id++, tensorRef, - cbType.getAddress(), cbDesc)); + if (mlir::isa(arg.getType())) { + auto cbType = mlir::cast(arg.getType()); + auto cbDesc = cache.getOrCreate(cbType, cbTypeToFlatbuffer); + auto tensorRef = operands[arg.getArgNumber()]; + cbs.push_back( + ::tt::target::CreateCBRef(fbb, cache.global_id++, tensorRef, + cbType.getAddress(), cbDesc)); + } else if (mlir::isa(arg.getType())) { + auto semType = mlir::cast(arg.getType()); + auto [runtime_arg_type, runtime_arg] = + toFlatbuffer(cache, semType); + runtime_args_type.push_back(runtime_arg_type); + runtime_args.push_back(runtime_arg); + } else { + llvm_unreachable( + "Block arguments must be either CBType or SemaphoreType"); + } } std::string &source = cppKernels[region.getRegionNumber()]; @@ -263,7 +284,7 @@ static std::shared_ptr translateModuleToFlatbuffer(Operation *op) { ::tt::target::metal::CreateKernelSourceDirect( fbb, source.c_str(), kernelConfigType, kernelConfigUnion) .Union(), - &coreRangeSet, &cbs, nullptr, nullptr, /* TODO rtargs*/ + &coreRangeSet, &cbs, &runtime_args_type, &runtime_args, nullptr /*TODO debug info*/)); } ::flatbuffers::Offset<::tt::target::metal::ProgramDesc> program =