From 6a219f20471133604c523a09ea1f75152d4e3c66 Mon Sep 17 00:00:00 2001 From: Stefan Djordjevic Date: Mon, 27 Jan 2025 17:14:04 +0000 Subject: [PATCH] Adding ttnn_to_dtype op in TTNN dialect --- include/ttmlir/Dialect/TTNN/IR/TTNNOps.td | 15 ++ include/ttmlir/Target/TTNN/program.fbs | 7 + lib/Conversion/TTNNToEmitC/TTNNToEmitC.cpp | 28 ++ .../TTNN/Transforms/TTNNDecomposeLayouts.cpp | 219 +++++++++++----- lib/Target/TTNN/TTNNToFlatbuffer.cpp | 16 ++ .../include/tt/runtime/detail/workarounds.h | 14 +- runtime/lib/common/workarounds.cpp | 4 +- runtime/lib/ttnn/operations/CMakeLists.txt | 1 + .../lib/ttnn/operations/layout/to_dtype.cpp | 21 ++ runtime/lib/ttnn/operations/layout/to_dtype.h | 15 ++ .../lib/ttnn/operations/layout/typecast.cpp | 8 +- runtime/lib/ttnn/program.cpp | 4 + test/lit.cfg.py | 6 + test/lit.site.cfg.py.in | 1 + .../decomposing_layouts_from_host.mlir | 237 +++++++++++++++++ test/ttmlir/Dialect/TTNN/simple_clamp.mlir | 3 +- test/ttmlir/EmitC/TTNN/other/embedding.mlir | 3 - .../create_system_desc_device.mlir | 4 + .../decomposing_layouts_from_host.mlir | 240 ++++++++++++++++++ .../TTNN/perf_unit/test_perf_clamp.mlir | 3 +- test/ttmlir/Silicon/TTNN/simple_eltwise.mlir | 3 +- ...ract-and-replace-system-desc-and-device.py | 46 ++++ 22 files changed, 802 insertions(+), 96 deletions(-) create mode 100644 runtime/lib/ttnn/operations/layout/to_dtype.cpp create mode 100644 runtime/lib/ttnn/operations/layout/to_dtype.h create mode 100644 test/ttmlir/Dialect/TTNN/Transforms/DecomposeLayouts/decomposing_layouts_from_host.mlir create mode 100644 test/ttmlir/Silicon/TTNN/Transforms/DecomposeLayouts/create_system_desc_device.mlir create mode 100644 test/ttmlir/Silicon/TTNN/Transforms/DecomposeLayouts/decomposing_layouts_from_host.mlir create mode 100644 tools/scripts/extract-and-replace-system-desc-and-device.py diff --git a/include/ttmlir/Dialect/TTNN/IR/TTNNOps.td b/include/ttmlir/Dialect/TTNN/IR/TTNNOps.td index 8a2e252c1f..b60e897c98 100644 --- a/include/ttmlir/Dialect/TTNN/IR/TTNNOps.td +++ b/include/ttmlir/Dialect/TTNN/IR/TTNNOps.td @@ -78,6 +78,21 @@ def TTNN_TypecastOp : TTNN_Op<"typecast"> { let results = (outs AnyRankedTensor:$result); } +def TTNN_ToDTypeOp : TTNN_Op<"to_dtype"> { + let summary = "ToDType op."; + let description = [{ + This op converts the data type of the input tensor based on the given data type on the host. + + Args: + - :attr:`input`: the ttnn.Tensor + - :attr:`dtype`: `ttnn` data type. 
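+
+    Example (illustrative sketch; `%input` and `%out` are placeholder names and
+    the exact attribute syntax is assumed rather than taken verbatim from this
+    patch — it mirrors the `#tt.supportedDataTypes` form used in the lit tests
+    added below):
+
+      // Cast a host-resident f32 tensor to bf16 without changing its layout.
+      %out = "ttnn.to_dtype"(%input) <{dtype = #tt.supportedDataTypes<bf16>}> : (tensor<64x128xf32>) -> tensor<64x128xbf16>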
+ }]; + + let arguments = (ins AnyRankedTensor:$input, + TT_DataTypeAttr:$dtype); + let results = (outs AnyRankedTensor:$result); +} + def TTNN_ToDeviceOp : TTNN_Op<"to_device"> { let summary = "ToDevice op."; let description = [{ diff --git a/include/ttmlir/Target/TTNN/program.fbs b/include/ttmlir/Target/TTNN/program.fbs index ecb5c6de54..c11dfa563c 100644 --- a/include/ttmlir/Target/TTNN/program.fbs +++ b/include/ttmlir/Target/TTNN/program.fbs @@ -24,6 +24,12 @@ table ToLayoutOp { out: tt.target.TensorRef; } +table ToDTypeOp { + in: tt.target.TensorRef; + dtype: tt.target.DataType; + out: tt.target.TensorRef; +} + table TypecastOp { in: tt.target.TensorRef; dtype: tt.target.DataType; @@ -396,6 +402,7 @@ union OpType { GetDeviceOp, ToMemoryConfigOp, ToLayoutOp, + ToDTypeOp, TypecastOp, ToDeviceOp, FromDeviceOp, diff --git a/lib/Conversion/TTNNToEmitC/TTNNToEmitC.cpp b/lib/Conversion/TTNNToEmitC/TTNNToEmitC.cpp index a22d1e4c6b..b9417d4d82 100644 --- a/lib/Conversion/TTNNToEmitC/TTNNToEmitC.cpp +++ b/lib/Conversion/TTNNToEmitC/TTNNToEmitC.cpp @@ -735,6 +735,33 @@ class TypecastOpConversionPattern }; } // namespace +// ToDTypeOp conversion pattern +// +namespace { +class ToDTypeOpConversionPattern + : public TTNNToEmitCBaseOpConversionPattern { + +public: + using TTNNToEmitCBaseOpConversionPattern< + ttnn::ToDTypeOp>::TTNNToEmitCBaseOpConversionPattern; + + LogicalResult + matchAndRewrite(ttnn::ToDTypeOp srcOp, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const override { + + ArrayAttr arrayAttrs = rewriter.getArrayAttr( + {mlir::IntegerAttr::get(rewriter.getIndexType(), 0), + ttnn_to_emitc::utils::convertDType(rewriter, srcOp.getDtypeAttr())}); + + rewriter.replaceOpWithNewOp( + srcOp, this->getTypeConverter()->convertType(srcOp.getType()), + this->convertOpName(srcOp), arrayAttrs, nullptr, adaptor.getOperands()); + + return success(); + } +}; +} // namespace + // ToMemoryConfig conversion pattern // namespace { @@ -1128,6 +1155,7 @@ void populateTTNNToEmitCPatterns(mlir::MLIRContext *ctx, // clang-format off patterns.add, TypecastOpConversionPattern, ToDeviceOpConversionPattern, FromDeviceOpConversionPattern, diff --git a/lib/Dialect/TTNN/Transforms/TTNNDecomposeLayouts.cpp b/lib/Dialect/TTNN/Transforms/TTNNDecomposeLayouts.cpp index 2bf4d90085..b988c96b3a 100644 --- a/lib/Dialect/TTNN/Transforms/TTNNDecomposeLayouts.cpp +++ b/lib/Dialect/TTNN/Transforms/TTNNDecomposeLayouts.cpp @@ -2,8 +2,11 @@ // // SPDX-License-Identifier: Apache-2.0 +#include "ttmlir/Dialect/TTNN/IR/TTNNOps.h" +#include "ttmlir/Dialect/TTNN/IR/TTNNOpsAttrs.h" #include "ttmlir/Dialect/TTNN/Transforms/Passes.h" #include "ttmlir/Dialect/TTNN/Utils/Utils.h" +#include namespace mlir::tt::ttnn { #define GEN_PASS_DEF_TTNNDECOMPOSELAYOUTS @@ -63,12 +66,12 @@ class TTNNDecomposeLayouts bool createToDeviceOp = false; bool createFromDeviceOp = false; bool createToLayoutOp = false; - bool createTypecastOp = false; + bool createDataTypeCastOp = false; bool createToMemoryConfigOp = false; bool createSomeOp() const { - return createToLayoutOp or createTypecastOp or createToDeviceOp or - createFromDeviceOp or createToMemoryConfigOp; + return createToLayoutOp || createDataTypeCastOp || createToDeviceOp || + createFromDeviceOp || createToMemoryConfigOp; } void print() const { @@ -80,7 +83,7 @@ class TTNNDecomposeLayouts << "\t" << "CreateToLayoutOp: " << createToLayoutOp << "\n" << "\t" - << "CreateTypecastOp: " << createTypecastOp << "\n" + << "CreateTypecastOp: " << createDataTypeCastOp << "\n" << "\t" << 
"CreateToMemoryConfigOp: " << createToMemoryConfigOp << "\n" @@ -146,7 +149,7 @@ class TTNNDecomposeLayouts opsToCreate.createFromDeviceOp = (input.bufferType != output.bufferType) and output.isOnHost(); - opsToCreate.createTypecastOp = input.dataType != output.dataType; + opsToCreate.createDataTypeCastOp = input.dataType != output.dataType; opsToCreate.createToLayoutOp = input.layoutEnum != output.layoutEnum; // TODO(bug #665): // Insert a ToLayoutOp manually if we're moving from device to host to @@ -286,30 +289,58 @@ class TTNNDecomposeLayouts RankedTensorType newResultType = utils::createRankedTensorTypeWithElementType(inputType, memrefElementType); + + TTNNLayoutAttr inputLayout = + mlir::cast(inputType.getEncoding()); + return this->createOp( rewriter, op, newResultType, currentInput, layoutAttr, /*dtype*/ nullptr, - /*memory_config*/ nullptr, /*device*/ nullptr); + /*memory_config*/ nullptr, + inputLayout.isSystemBufferType() ? nullptr : info.device); } - mlir::Value createTypecastOpIfNeeded(ttnn::ToLayoutOp op, - IRRewriter &rewriter, - mlir::Value currentInput, - const OpCreationInfo &info) const { - if (not info.opsToCreate.createTypecastOp) { - return currentInput; - } + template + mlir::Value createDataTypeCastingOp(ttnn::ToLayoutOp op, IRRewriter &rewriter, + mlir::Value currentInput, + const OpCreationInfo &info) const { DataTypeAttr dtypeAttr = DataTypeAttr::get(op.getContext(), info.output.dataType); RankedTensorType currentInputType = mlir::cast(currentInput.getType()); + TTNNLayoutAttr currentInputLayout = + mlir::cast(currentInputType.getEncoding()); Type nmemrefElementType = utils::getElementType( - op.getContext(), info.input.layoutEnum, info.output.dataType); + op.getContext(), currentInputLayout.getLayout(), info.output.dataType); RankedTensorType newResultType = utils::createRankedTensorTypeWithElementType(currentInputType, nmemrefElementType); - return this->createOp(rewriter, op, newResultType, - currentInput, dtypeAttr); + return this->createOp(rewriter, op, newResultType, currentInput, + dtypeAttr); + } + + mlir::Value + createDataTypeCastingOpIfNeeded(ttnn::ToLayoutOp op, IRRewriter &rewriter, + mlir::Value currentInput, + const OpCreationInfo &info) const { + if (!info.opsToCreate.createDataTypeCastOp) { + return currentInput; + } + + RankedTensorType currentInputType = + mlir::cast(currentInput.getType()); + + TTNNLayoutAttr inputLayout = + mlir::cast(currentInputType.getEncoding()); + if (inputLayout.isSystemBufferType()) { + // If the input tensor is on host, we need to cast it on the host + return this->createDataTypeCastingOp(op, rewriter, + currentInput, info); + } + + // If the input tensor is on device, we can cast it on the device. + return this->createDataTypeCastingOp(op, rewriter, + currentInput, info); } mlir::Value createToMemoryConfigOpIfNeeded(ttnn::ToLayoutOp op, @@ -350,7 +381,19 @@ class TTNNDecomposeLayouts const LayoutInfo &output = info.output; assert(input.dataType == output.dataType && "Data type should be the same if we're not creating typecast op"); - /* if we should untilize, untilize on host */ + + // If the output is on the host, we can perform layout conversion on host. 
+ if (output.isOnHost()) { + currentInput = + this->createToLayoutOpIfNeeded(op, rewriter, currentInput, info); + currentInput = this->createToMemoryConfigOpIfNeeded(op, rewriter, + currentInput, info); + op.getResult().replaceAllUsesWith(currentInput); + return; + } + + // If the output is on device and we should untilize, we can untilize on + // host and than move the tensor to device. if (info.shouldUntilize()) { currentInput = this->createToLayoutOpIfNeeded(op, rewriter, currentInput, info); @@ -362,9 +405,10 @@ class TTNNDecomposeLayouts return; } - /* If we should tilize, and the data type is bfloat16, we can tilize on - * device */ - if (info.shouldTilize() and output.dataType == DataType::BFloat16) { + // Tilizing on device is supported only for bf16 data format. If the tensor + // is bf16 and the output is on device, we can move the tensor to device and + // perform the tilization on device. + if (info.shouldTilize() && output.dataType == DataType::BFloat16) { currentInput = this->createToDeviceOpIfNeeded(op, rewriter, currentInput, info); currentInput = @@ -375,9 +419,9 @@ class TTNNDecomposeLayouts return; } - /* If we should tilize, and the data type is not bfloat16, we tilize on host - */ - if (info.shouldTilize() and output.dataType != DataType::BFloat16) { + // Otherwise, if tensor is not in bf16 data format, we perform tilizing on + // host and than move the tensor to device. + if (info.shouldTilize() && output.dataType != DataType::BFloat16) { currentInput = this->createToLayoutOpIfNeeded(op, rewriter, currentInput, info); currentInput = @@ -400,24 +444,40 @@ class TTNNDecomposeLayouts assert(input.layoutEnum == output.layoutEnum && "Layout should be the same if we're not creating a ToLayoutOp"); - /* If the output is already tilized, we can typecast on device */ - if (output.isTilized()) { + // If the output is on the host, we can perform the data type cast directly + // on the host. + if (output.isOnHost()) { + currentInput = this->createDataTypeCastingOpIfNeeded(op, rewriter, + currentInput, info); + currentInput = this->createToMemoryConfigOpIfNeeded(op, rewriter, + currentInput, info); + op.getResult().replaceAllUsesWith(currentInput); + return; + } + + // Device typecast only supports tilized tensors. Therefore, if the output + // tensor is in row-major (input as well is in row-major) and resides on the + // device, we should perform the data type casting on the host before moving + // the tensor back to the device. + if (!output.isTilized()) { + currentInput = this->createDataTypeCastingOpIfNeeded(op, rewriter, + currentInput, info); currentInput = this->createToDeviceOpIfNeeded(op, rewriter, currentInput, info); - currentInput = - this->createTypecastOpIfNeeded(op, rewriter, currentInput, info); currentInput = this->createToMemoryConfigOpIfNeeded(op, rewriter, currentInput, info); op.getResult().replaceAllUsesWith(currentInput); return; } - /* If the output is not tilized, typecast on host */ - if (not output.isTilized()) { - currentInput = - this->createTypecastOpIfNeeded(op, rewriter, currentInput, info); + // If the output tensor is tilized and resides on the device, we can move + // the tensor to the device and perform the data type cast directly on the + // device. 
+ if (output.isTilized()) { currentInput = this->createToDeviceOpIfNeeded(op, rewriter, currentInput, info); + currentInput = this->createDataTypeCastingOpIfNeeded(op, rewriter, + currentInput, info); currentInput = this->createToMemoryConfigOpIfNeeded(op, rewriter, currentInput, info); op.getResult().replaceAllUsesWith(currentInput); @@ -433,11 +493,26 @@ class TTNNDecomposeLayouts const LayoutInfo &input = info.input; const LayoutInfo &output = info.output; - /* If we need to untilize and typecast, then untilize and typecast on host - */ - if (info.shouldUntilize()) { + // If the output tensor is on host, we can perform the data type cast and + // layout conversion on host. + if (output.isOnHost()) { + currentInput = this->createDataTypeCastingOpIfNeeded(op, rewriter, + currentInput, info); currentInput = - this->createTypecastOpIfNeeded(op, rewriter, currentInput, info); + this->createToLayoutOpIfNeeded(op, rewriter, currentInput, info); + currentInput = this->createToMemoryConfigOpIfNeeded(op, rewriter, + currentInput, info); + op.getResult().replaceAllUsesWith(currentInput); + return; + } + + // Untilize is only supported on the host, and typecast is only supported on + // the device for tilized tensors. Therefore, we need to untilize and change + // the tensor data type format on the host before moving the tensor to the + // device. + if (info.shouldUntilize()) { + currentInput = this->createDataTypeCastingOpIfNeeded(op, rewriter, + currentInput, info); currentInput = this->createToLayoutOpIfNeeded(op, rewriter, currentInput, info); currentInput = @@ -448,26 +523,29 @@ class TTNNDecomposeLayouts return; } - /* If we need to tilize and the input datatype is bfloat16 - we can tilize on device and then typecast afterwards */ - if (info.shouldTilize() and input.dataType == DataType::BFloat16) { + // If we need to tilize and change the data type from bf16 to another + // format, we can move the tensor to the device, perform the tilization, and + // then cast the data type on the device since tilization is supported for + // bf16 on the device. + if (info.shouldTilize() && input.dataType == DataType::BFloat16) { currentInput = this->createToDeviceOpIfNeeded(op, rewriter, currentInput, info); currentInput = this->createToLayoutOpIfNeeded(op, rewriter, currentInput, info); - currentInput = - this->createTypecastOpIfNeeded(op, rewriter, currentInput, info); + currentInput = this->createDataTypeCastingOpIfNeeded(op, rewriter, + currentInput, info); currentInput = this->createToMemoryConfigOpIfNeeded(op, rewriter, currentInput, info); op.getResult().replaceAllUsesWith(currentInput); return; } - /* if we need to tilize and the output data type is bfloat16 - we can typecast on host and tilize on device */ + // If we need to tilize and change the data type format from another format + // to bf16, we can cast the data type on the host, move the tensor to the + // device, and then perform the tilization. 
if (info.shouldTilize() and output.dataType == DataType::BFloat16) { - currentInput = - this->createTypecastOpIfNeeded(op, rewriter, currentInput, info); + currentInput = this->createDataTypeCastingOpIfNeeded(op, rewriter, + currentInput, info); currentInput = this->createToDeviceOpIfNeeded(op, rewriter, currentInput, info); currentInput = @@ -482,8 +560,8 @@ class TTNNDecomposeLayouts * everything on host */ if (info.shouldTilize() and input.dataType != DataType::BFloat16 and output.dataType != DataType::BFloat16) { - currentInput = - this->createTypecastOpIfNeeded(op, rewriter, currentInput, info); + currentInput = this->createDataTypeCastingOpIfNeeded(op, rewriter, + currentInput, info); currentInput = this->createToLayoutOpIfNeeded(op, rewriter, currentInput, info); currentInput = @@ -502,17 +580,17 @@ class TTNNDecomposeLayouts mlir::Value currentInput, const OpCreationInfo &info) const { const OpsToCreate &opsToCreate = info.opsToCreate; - if (not opsToCreate.createToLayoutOp and not opsToCreate.createTypecastOp) { + if (!opsToCreate.createToLayoutOp && !opsToCreate.createDataTypeCastOp) { return handleHostInputNoLayoutNoTypecast(op, rewriter, currentInput, info); } - if (opsToCreate.createToLayoutOp and not opsToCreate.createTypecastOp) { + if (opsToCreate.createToLayoutOp && !opsToCreate.createDataTypeCastOp) { return handleHostInputLayoutNoTypecast(op, rewriter, currentInput, info); } - if (not opsToCreate.createToLayoutOp and opsToCreate.createTypecastOp) { + if (!opsToCreate.createToLayoutOp && opsToCreate.createDataTypeCastOp) { return handleHostInputNoLayoutTypecast(op, rewriter, currentInput, info); } - if (opsToCreate.createToLayoutOp and opsToCreate.createTypecastOp) { + if (opsToCreate.createToLayoutOp && opsToCreate.createDataTypeCastOp) { return handleHostInputLayoutTypecast(op, rewriter, currentInput, info); } llvm_unreachable("Unreachable code path"); @@ -630,8 +708,8 @@ class TTNNDecomposeLayouts /* If the output is tilized, typecast directly on device*/ if (output.isTilized()) { - currentInput = - this->createTypecastOpIfNeeded(op, rewriter, currentInput, info); + currentInput = this->createDataTypeCastingOpIfNeeded(op, rewriter, + currentInput, info); currentInput = this->createToMemoryConfigOpIfNeeded(op, rewriter, currentInput, info); currentInput = @@ -644,8 +722,8 @@ class TTNNDecomposeLayouts if (not output.isTilized() and opsToCreate.createFromDeviceOp) { currentInput = this->createFromDeviceOpIfNeeded(op, rewriter, currentInput, info); - currentInput = - this->createTypecastOpIfNeeded(op, rewriter, currentInput, info); + currentInput = this->createDataTypeCastingOpIfNeeded(op, rewriter, + currentInput, info); op.getResult().replaceAllUsesWith(currentInput); return; } @@ -656,8 +734,8 @@ class TTNNDecomposeLayouts currentInput = this->createOp(op, rewriter, currentInput); // typecast on host - currentInput = - this->createTypecastOpIfNeeded(op, rewriter, currentInput, info); + currentInput = this->createDataTypeCastingOpIfNeeded(op, rewriter, + currentInput, info); // move back to device and convert memory config if needed currentInput = this->createOp( op, rewriter, currentInput, info.device, @@ -680,8 +758,8 @@ class TTNNDecomposeLayouts /* If we need to untilize, typecast on device and untilize on host */ if (info.shouldUntilize() and opsToCreate.createFromDeviceOp) { - currentInput = - this->createTypecastOpIfNeeded(op, rewriter, currentInput, info); + currentInput = this->createDataTypeCastingOpIfNeeded(op, rewriter, + currentInput, info); currentInput = 
this->createFromDeviceOpIfNeeded(op, rewriter, currentInput, info); currentInput = @@ -694,8 +772,8 @@ class TTNNDecomposeLayouts * host, move back to device */ if (info.shouldUntilize() and not opsToCreate.createFromDeviceOp) { // typecast on device - currentInput = - this->createTypecastOpIfNeeded(op, rewriter, currentInput, info); + currentInput = this->createDataTypeCastingOpIfNeeded(op, rewriter, + currentInput, info); // Force-create a FromDeviceOp currentInput = this->createFromDeviceOpIfNeeded( op, rewriter, currentInput, info, true /* forceCreate */); @@ -714,8 +792,8 @@ class TTNNDecomposeLayouts if (info.shouldTilize() and input.dataType == DataType::BFloat16) { currentInput = this->createToLayoutOpIfNeeded(op, rewriter, currentInput, info); - currentInput = - this->createTypecastOpIfNeeded(op, rewriter, currentInput, info); + currentInput = this->createDataTypeCastingOpIfNeeded(op, rewriter, + currentInput, info); currentInput = this->createToMemoryConfigOpIfNeeded(op, rewriter, currentInput, info); currentInput = @@ -732,8 +810,8 @@ class TTNNDecomposeLayouts this->createFromDeviceOpIfNeeded(op, rewriter, currentInput, info); currentInput = this->createToLayoutOpIfNeeded(op, rewriter, currentInput, info); - currentInput = - this->createTypecastOpIfNeeded(op, rewriter, currentInput, info); + currentInput = this->createDataTypeCastingOpIfNeeded(op, rewriter, + currentInput, info); op.getResult().replaceAllUsesWith(currentInput); return; } @@ -753,8 +831,8 @@ class TTNNDecomposeLayouts currentInput = this->createOp( op, rewriter, currentInput, info.device, /* optional MemConfigAttr */ nullptr); - currentInput = - this->createTypecastOpIfNeeded(op, rewriter, currentInput, info); + currentInput = this->createDataTypeCastingOpIfNeeded(op, rewriter, + currentInput, info); currentInput = this->createToMemoryConfigOpIfNeeded(op, rewriter, currentInput, info); op.getResult().replaceAllUsesWith(currentInput); @@ -769,19 +847,20 @@ class TTNNDecomposeLayouts mlir::Value currentInput, const OpCreationInfo &info) const { const OpsToCreate &opsToCreate = info.opsToCreate; - if (not opsToCreate.createToLayoutOp and not opsToCreate.createTypecastOp) { + if (not opsToCreate.createToLayoutOp and + not opsToCreate.createDataTypeCastOp) { handleDeviceInputNoLayoutNoTypecast(op, rewriter, currentInput, info); return; } - if (opsToCreate.createToLayoutOp and not opsToCreate.createTypecastOp) { + if (opsToCreate.createToLayoutOp and not opsToCreate.createDataTypeCastOp) { handleDeviceInputLayoutNoTypecast(op, rewriter, currentInput, info); return; } - if (not opsToCreate.createToLayoutOp and opsToCreate.createTypecastOp) { + if (not opsToCreate.createToLayoutOp and opsToCreate.createDataTypeCastOp) { handleDeviceInputNoLayoutTypecast(op, rewriter, currentInput, info); return; } - if (opsToCreate.createToLayoutOp and opsToCreate.createTypecastOp) { + if (opsToCreate.createToLayoutOp and opsToCreate.createDataTypeCastOp) { handleDeviceInputLayoutTypecast(op, rewriter, currentInput, info); return; } diff --git a/lib/Target/TTNN/TTNNToFlatbuffer.cpp b/lib/Target/TTNN/TTNNToFlatbuffer.cpp index d45bea5636..4538e6d774 100644 --- a/lib/Target/TTNN/TTNNToFlatbuffer.cpp +++ b/lib/Target/TTNN/TTNNToFlatbuffer.cpp @@ -225,6 +225,18 @@ createOp(FlatbufferObjectCache &cache, ToLayoutOp op) { device ? 
cache.at<::tt::target::DeviceRef>(device) : 0, output); } +::flatbuffers::Offset<::tt::target::ttnn::ToDTypeOp> +createOp(FlatbufferObjectCache &cache, ToDTypeOp op) { + auto input = + cache.at<::tt::target::TensorRef>(getOperandThroughDPSOps(op.getInput())); + ::tt::target::DataType dtype = + ::tt::mlir::ttnn::utils::toTargetDataType(op.getDtype()); + auto output = cache.getOrCreate(op.getResult(), tensorValueToFlatbuffer, + kHostAllocatedAddress, kHostAllocatedSize); + + return ::tt::target::ttnn::CreateToDTypeOp(*cache.fbb, input, dtype, output); +} + ::flatbuffers::Offset<::tt::target::ttnn::TypecastOp> createOp(FlatbufferObjectCache &cache, TypecastOp op) { auto input = @@ -1032,6 +1044,10 @@ emitTTNNOperation(FlatbufferObjectCache &cache, Operation *op, return createOperation(cache, createOp(cache, toLayoutOp), debugString, locInfo); } + if (auto toDTypeOp = dyn_cast(op); toDTypeOp) { + return createOperation(cache, createOp(cache, toDTypeOp), debugString, + locInfo); + } if (auto typecastOp = dyn_cast(op); typecastOp) { return createOperation(cache, createOp(cache, typecastOp), debugString, locInfo); diff --git a/runtime/include/tt/runtime/detail/workarounds.h b/runtime/include/tt/runtime/detail/workarounds.h index 50a12ea108..e33def97ad 100644 --- a/runtime/include/tt/runtime/detail/workarounds.h +++ b/runtime/include/tt/runtime/detail/workarounds.h @@ -17,12 +17,12 @@ struct Env { #endif get(bool maxpool2dPreshard = true, bool swapBinaryOperands = true, bool readUpdateIndexFromDeviceForKVCache = true, - bool toDtypeOnHost = true, bool defaultStrideComputation = true) + bool defaultStrideComputation = true) #if defined(TT_RUNTIME_WORKAROUNDS) && TT_RUNTIME_WORKAROUNDS == 1 ; #else { - return Env(true, true, true, true, true); + return Env(true, true, true, true); } #endif // TODO(bug #855): Ideally we should have an op that preshards for maxpool2d @@ -40,11 +40,6 @@ struct Env { // to be able to pluck this update index from a runtime tensor. bool readUpdateIndexFromDeviceForKVCache; - // TODO(bug #1658): We're currently use ttnn::to_dtype operation to cast the - // data type of a tensor on host. Once we have improved the typecast operation - // to handle this, we should remove this workaround. - bool toDtypeOnHost; - // TODO(bug #2045): Our current stride calculation is incorrect for tilized // tensors. 
The current solution is to remove stride entirely from the // flatbuffer and calculate the stride in runtime assuming using the default @@ -54,13 +49,12 @@ struct Env { private: constexpr Env(bool maxpool2dPreshard, bool swapBinaryOperands, - bool readUpdateIndexFromDeviceForKVCache, bool toDtypeOnHost, + bool readUpdateIndexFromDeviceForKVCache, bool defaultStrideComputation) : maxpool2dPreshard(maxpool2dPreshard), swapBinaryOperands(swapBinaryOperands), readUpdateIndexFromDeviceForKVCache( readUpdateIndexFromDeviceForKVCache), - toDtypeOnHost(toDtypeOnHost), defaultStrideComputation(defaultStrideComputation) {} }; @@ -73,8 +67,6 @@ inline std::ostream &operator<<(std::ostream &os, const Env &env) { os << "\t" << "readUpdateIndexFromDeviceForKVCache: " << env.readUpdateIndexFromDeviceForKVCache << "\n"; - os << "\t" - << "toDtypeOnHost: " << env.toDtypeOnHost << "\n"; os << "\t" << "defaultStrideComputation: " << env.defaultStrideComputation << "\n"; os << "}"; diff --git a/runtime/lib/common/workarounds.cpp b/runtime/lib/common/workarounds.cpp index 5396b8a8c1..bc0481ff62 100644 --- a/runtime/lib/common/workarounds.cpp +++ b/runtime/lib/common/workarounds.cpp @@ -8,9 +8,9 @@ namespace tt::runtime::workaround { #if defined(TT_RUNTIME_WORKAROUNDS) && TT_RUNTIME_WORKAROUNDS == 1 const Env &Env::get(bool maxpool2dPreshard, bool swapBinaryOperands, bool readUpdateIndexFromDeviceForKVCache, - bool toDtypeOnHost, bool defaultStrideComputation) { + bool defaultStrideComputation) { static const Env config(maxpool2dPreshard, swapBinaryOperands, - readUpdateIndexFromDeviceForKVCache, toDtypeOnHost, + readUpdateIndexFromDeviceForKVCache, defaultStrideComputation); return config; } diff --git a/runtime/lib/ttnn/operations/CMakeLists.txt b/runtime/lib/ttnn/operations/CMakeLists.txt index 29d3ab8255..87be4740c4 100644 --- a/runtime/lib/ttnn/operations/CMakeLists.txt +++ b/runtime/lib/ttnn/operations/CMakeLists.txt @@ -32,6 +32,7 @@ set(TTNN_OPS_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/layout/to_device.cpp ${CMAKE_CURRENT_SOURCE_DIR}/layout/from_device.cpp ${CMAKE_CURRENT_SOURCE_DIR}/layout/to_layout.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/layout/to_dtype.cpp ${CMAKE_CURRENT_SOURCE_DIR}/layout/typecast.cpp ${CMAKE_CURRENT_SOURCE_DIR}/layout/to_memory_config.cpp # ANCHOR: adding_an_op_matmul_runtime_cmake diff --git a/runtime/lib/ttnn/operations/layout/to_dtype.cpp b/runtime/lib/ttnn/operations/layout/to_dtype.cpp new file mode 100644 index 0000000000..4b69bd5846 --- /dev/null +++ b/runtime/lib/ttnn/operations/layout/to_dtype.cpp @@ -0,0 +1,21 @@ +// SPDX-FileCopyrightText: (c) 2024 Tenstorrent AI ULC +// +// SPDX-License-Identifier: Apache-2.0 + +#include "operations/layout/to_dtype.h" +#include "tt/runtime/ttnn/utils.h" + +namespace tt::runtime::ttnn::operations::layout { + +void run(const ::tt::target::ttnn::ToDTypeOp *op, ProgramContext &context) { + ProgramTensorPool &tensorPool = context.getTensorPool(); + const ::ttnn::Tensor &inputTensor = tensorPool.at(op->in()->global_id()); + + ::ttnn::DataType targetDataType = + ::tt::runtime::ttnn::utils::toTTNNDataType(op->dtype()); + + ::ttnn::Tensor out = ::ttnn::to_dtype(inputTensor, targetDataType); + + tensorPool.insert_or_assign(op->out()->global_id(), out); +} +} // namespace tt::runtime::ttnn::operations::layout diff --git a/runtime/lib/ttnn/operations/layout/to_dtype.h b/runtime/lib/ttnn/operations/layout/to_dtype.h new file mode 100644 index 0000000000..91a110979a --- /dev/null +++ b/runtime/lib/ttnn/operations/layout/to_dtype.h @@ -0,0 +1,15 @@ +// 
SPDX-FileCopyrightText: (c) 2024 Tenstorrent AI ULC +// +// SPDX-License-Identifier: Apache-2.0 + +#ifndef RUNTIME_LIB_TTNN_OPERATIONS_LAYOUT_TO_DTYPE_H +#define RUNTIME_LIB_TTNN_OPERATIONS_LAYOUT_TO_DTYPE_H + +#include "tt/runtime/ttnn/types.h" +#include "ttmlir/Target/TTNN/program_generated.h" + +namespace tt::runtime::ttnn::operations::layout { +void run(const ::tt::target::ttnn::ToDTypeOp *op, ProgramContext &context); +} // namespace tt::runtime::ttnn::operations::layout + +#endif diff --git a/runtime/lib/ttnn/operations/layout/typecast.cpp b/runtime/lib/ttnn/operations/layout/typecast.cpp index a76f5a2987..f46299fd18 100644 --- a/runtime/lib/ttnn/operations/layout/typecast.cpp +++ b/runtime/lib/ttnn/operations/layout/typecast.cpp @@ -18,13 +18,7 @@ void run(const ::tt::target::ttnn::TypecastOp *op, ProgramContext &context) { ::ttnn::DataType targetDataType = ::tt::runtime::ttnn::utils::toTTNNDataType(op->dtype()); - ::ttnn::Tensor out; - if (workaround::Env::get().toDtypeOnHost && - ::tt::runtime::ttnn::utils::isOnHost(inputTensor.storage_type())) { - out = ::ttnn::to_dtype(inputTensor, targetDataType); - } else { - out = ::ttnn::typecast(inputTensor, targetDataType); - } + ::ttnn::Tensor out = ::ttnn::typecast(inputTensor, targetDataType); tensorPool.insert_or_assign(op->out()->global_id(), out); } diff --git a/runtime/lib/ttnn/program.cpp b/runtime/lib/ttnn/program.cpp index 9de6ddb009..44b1459a7d 100644 --- a/runtime/lib/ttnn/program.cpp +++ b/runtime/lib/ttnn/program.cpp @@ -30,6 +30,7 @@ #include "operations/kv_cache/update_cache.h" #include "operations/layout/from_device.h" #include "operations/layout/to_device.h" +#include "operations/layout/to_dtype.h" #include "operations/layout/to_layout.h" #include "operations/layout/to_memory_config.h" #include "operations/layout/typecast.h" @@ -163,6 +164,9 @@ void ProgramExecutor::runOperation(const ::tt::target::ttnn::Operation *op) { case ::tt::target::ttnn::OpType::ToLayoutOp: { return operations::layout::run(op->type_as_ToLayoutOp(), context); } + case ::tt::target::ttnn::OpType::ToDTypeOp: { + return operations::layout::run(op->type_as_ToDTypeOp(), context); + } case ::tt::target::ttnn::OpType::TypecastOp: { return operations::layout::run(op->type_as_TypecastOp(), context); } diff --git a/test/lit.cfg.py b/test/lit.cfg.py index 886d5e558d..92d5e59e7c 100644 --- a/test/lit.cfg.py +++ b/test/lit.cfg.py @@ -90,6 +90,12 @@ def set_system_desc_features(system_desc): config.substitutions.append(("%ttmlir_libs", config.ttmlir_libs_dir)) +config.test_root = os.path.join(config.ttmlir_source_dir, "test") +config.scripts_root = os.path.join(config.ttmlir_source_dir, "tools/scripts") + +config.substitutions.append(("%ttmlir_test_root", config.test_root)) +config.substitutions.append(("%ttmlir_scripts_root", config.scripts_root)) + # Tweak the PATH to include the tools dir. 
llvm_config.with_environment("PATH", config.llvm_tools_dir, append_path=True) diff --git a/test/lit.site.cfg.py.in b/test/lit.site.cfg.py.in index 7c3b1928a3..bca25b3e01 100644 --- a/test/lit.site.cfg.py.in +++ b/test/lit.site.cfg.py.in @@ -5,6 +5,7 @@ config.lit_tools_dir = "@LLVM_LIT_TOOLS_DIR@" config.mlir_obj_dir = "@MLIR_BINARY_DIR@" config.enable_bindings_python = @MLIR_ENABLE_BINDINGS_PYTHON@ and "@TTMLIR_ENABLE_BINDINGS_PYTHON@" == "ON" config.ttmlir_obj_root = "@TTMLIR_BINARY_DIR@" +config.ttmlir_source_dir = "@TTMLIR_SOURCE_DIR@" config.llvm_shlib_ext = "@SHLIBEXT@" config.enable_stablehlo = "@TTMLIR_ENABLE_STABLEHLO@" and "@TTMLIR_ENABLE_STABLEHLO@" == "ON" config.enable_pykernel = "@TTMLIR_ENABLE_PYKERNEL@" and "@TTMLIR_ENABLE_PYKERNEL@" == "ON" diff --git a/test/ttmlir/Dialect/TTNN/Transforms/DecomposeLayouts/decomposing_layouts_from_host.mlir b/test/ttmlir/Dialect/TTNN/Transforms/DecomposeLayouts/decomposing_layouts_from_host.mlir new file mode 100644 index 0000000000..93fc61a72d --- /dev/null +++ b/test/ttmlir/Dialect/TTNN/Transforms/DecomposeLayouts/decomposing_layouts_from_host.mlir @@ -0,0 +1,237 @@ +// RUN: ttmlir-opt --ttnn-decompose-layouts %s | FileCheck %s +#device = #tt.device (0, d0, d1)>, l1Map = (d0, d1)[s0, s1] -> (0, d0 floordiv s0, d1 floordiv s1, (d0 mod s0) * s1 + d1 mod s1), dramMap = (d0, d1)[s0, s1] -> (0, 0, ((((d0 floordiv s0) * 8 + d1 floordiv s1) * (s1 * s0) + (d0 mod s0) * s1 + d1 mod s1) floordiv 8192) mod 12, (((d0 floordiv s0) * 8 + d1 floordiv s1) * (s1 * s0) + (d0 mod s0) * s1 + d1 mod s1) floordiv 98304 + (((d0 floordiv s0) * 8 + d1 floordiv s1) * (s1 * s0) + (d0 mod s0) * s1 + d1 mod s1) mod 8192), meshShape = , chipIds = [0]> +#system_desc = #tt.system_desc<[{role = host, target_triple = "x86_64-pc-linux"}], [{arch = , grid = 8x8, l1_size = 1499136, num_dram_channels = 12, dram_channel_size = 1073741824, noc_l1_address_align_bytes = 16, pcie_address_align_bytes = 32, noc_dram_address_align_bytes = 32, l1_unreserved_base = 99104, erisc_l1_unreserved_base = 104480, dram_unreserved_base = 32, dram_unreserved_end = 1073196736, physical_cores = {worker = [ 18x18, 18x19, 18x20, 18x21, 18x22, 18x23, 18x24, 18x25, 19x18, 19x19, 19x20, 19x21, 19x22, 19x23, 19x24, 19x25, 20x18, 20x19, 20x20, 20x21, 20x22, 20x23, 20x24, 20x25, 21x18, 21x19, 21x20, 21x21, 21x22, 21x23, 21x24, 21x25, 22x18, 22x19, 22x20, 22x21, 22x22, 22x23, 22x24, 22x25, 23x18, 23x19, 23x20, 23x21, 23x22, 23x23, 23x24, 23x25, 24x18, 24x19, 24x20, 24x21, 24x22, 24x23, 24x24, 24x25, 25x18, 25x19, 25x20, 25x21, 25x22, 25x23, 25x24, 25x25] dram = [ 0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0x10, 0x11] eth = [ 17x25] eth_inactive = [ 16x18, 16x19, 16x20, 16x21, 16x22, 16x23, 16x24, 16x25, 17x19, 17x20, 17x22, 17x23, 17x24]}, supported_data_types = [, , , , , , , , , , , ], supported_tile_sizes = [ 4x16, 16x16, 32x16, 4x32, 16x32, 32x32], num_cbs = 32}, {arch = , grid = 8x8, l1_size = 1499136, num_dram_channels = 12, dram_channel_size = 1073741824, noc_l1_address_align_bytes = 16, pcie_address_align_bytes = 32, noc_dram_address_align_bytes = 32, l1_unreserved_base = 99104, erisc_l1_unreserved_base = 104480, dram_unreserved_base = 32, dram_unreserved_end = 1073196736, physical_cores = {worker = [ 18x18, 18x19, 18x20, 18x21, 18x22, 18x23, 18x24, 18x25, 19x18, 19x19, 19x20, 19x21, 19x22, 19x23, 19x24, 19x25, 20x18, 20x19, 20x20, 20x21, 20x22, 20x23, 20x24, 20x25, 21x18, 21x19, 21x20, 21x21, 21x22, 21x23, 21x24, 21x25, 22x18, 22x19, 22x20, 22x21, 22x22, 22x23, 22x24, 22x25, 23x18, 23x19, 
23x20, 23x21, 23x22, 23x23, 23x24, 23x25, 24x18, 24x19, 24x20, 24x21, 24x22, 24x23, 24x24, 24x25, 25x18, 25x19, 25x20, 25x21, 25x22, 25x23, 25x24, 25x25] dram = [ 0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0x10, 0x11] eth = [ 16x25] eth_inactive = [ 16x19, 16x20, 16x21, 16x22, 16x23, 16x24, 17x18, 17x19, 17x20, 17x21, 17x22, 17x23, 17x24, 17x25]}, supported_data_types = [, , , , , , , , , , , ], supported_tile_sizes = [ 4x16, 16x16, 32x16, 4x32, 16x32, 32x32], num_cbs = 32}], [0, 1], [3 : i32, 0 : i32], [ 0x0x0x0], [<[0, 8, 0], [1, 0, 0]>]> +#dram = #ttnn.buffer_type +#system_memory = #ttnn.buffer_type +#ttnn_layout_host_rm = #ttnn.ttnn_layout<(d0, d1) -> (d0, d1), <1x1>, memref<64x128xf32, #system_memory>> +#ttnn_layout_host_rm_bf16 = #ttnn.ttnn_layout<(d0, d1) -> (d0, d1), <1x1>, memref<64x128xbf16, #system_memory>> +#ttnn_layout_host_tile = #ttnn.ttnn_layout<(d0, d1) -> (d0, d1), <1x1>, memref<2x4x!tt.tile<32x32, f32>, #system_memory>> +#ttnn_layout_host_tile_bf16 = #ttnn.ttnn_layout<(d0, d1) -> (d0, d1), <1x1>, memref<2x4x!tt.tile<32x32, bf16>, #system_memory>> +#ttnn_layout_device_rm = #ttnn.ttnn_layout<(d0, d1) -> (d0, d1), <1x1>, memref<64x128xf32, #dram>, > +#ttnn_layout_device_tile = #ttnn.ttnn_layout<(d0, d1) -> (d0, d1), <1x1>, memref<2x4x!tt.tile<32x32, f32>, #dram>, > +#ttnn_layout_device_tile_bf16 = #ttnn.ttnn_layout<(d0, d1) -> (d0, d1), <1x1>, memref<2x4x!tt.tile<32x32, bf16>, #dram>, > +#ttnn_layout_device_tile_u32 = #ttnn.ttnn_layout<(d0, d1) -> (d0, d1), <1x1>, memref<2x4x!tt.tile<32x32, u32>, #dram>, > +#ttnn_layout_device_rm_bf16 = #ttnn.ttnn_layout<(d0, d1) -> (d0, d1), <1x1>, memref<64x128xbf16, #dram>, > +#ttnn_layout1 = #ttnn.ttnn_layout<(d0, d1) -> (d0, d1), <1x1>, memref<64x128xbf16, #system_memory>> +#ttnn_layout2 = #ttnn.ttnn_layout<(d0, d1) -> (d0, d1), <1x1>, memref<2x4x!tt.tile<32x32, bf16>, #dram>, > +module attributes {tt.device = #device, tt.system_desc = #system_desc} { + + // Test cases when we do layout transformation from host and we don't change tensor layout and tensor data type + // + + // Test case when we move tensor from host to device. + func.func @from_host_to_device_layout_to_layout_dt_to_dt_create_to_device_op(%arg0: tensor<64x128xf32, #ttnn_layout_host_rm>) -> tensor<64x128xf32, #ttnn_layout_device_rm> { + // Verify that we only insert the to_device op when there are no layout or data type changes. + // CHECK: %[[GET_DEVICE_OP:.*]] = "ttnn.get_device"() + // CHECK: %[[TO_DEVICE_OP:.*]] = "ttnn.to_device"(%arg0, %[[GET_DEVICE_OP]]) + // CHECK-SAME: memory_config = #ttnn.memory_config<#dram, <<64x128>>, > + // CHECK: return %[[TO_DEVICE_OP]] + %0 = "ttnn.get_device"() <{mesh_shape = #ttnn}> : () -> !tt.device<#device> + %1 = "ttnn.to_layout"(%arg0, %0) <{dtype = #tt.supportedDataTypes, layout = #ttnn.layout, memory_config = #ttnn.memory_config<#dram, <<64x128>>, >}> : (tensor<64x128xf32, #ttnn_layout_host_rm>, !tt.device<#device>) -> tensor<64x128xf32, #ttnn_layout_device_rm> + return %1 : tensor<64x128xf32, #ttnn_layout_device_rm> + } + + // Test cases when we do layout transformation from host and we don't change tensor layout but we cast tensor data type. + // + + // Test case when we move tensor from host to host for tile case. + func.func @from_host_to_host_layout_to_layout_create_data_cast_op_tile(%arg0: tensor<64x128xf32, #ttnn_layout_host_tile>) -> tensor<64x128xbf16, #ttnn_layout_host_tile_bf16> { + // Typecast works only on device. 
Verify that for the tile case when the output is on host, we insert the to_dtype op to cast the data type on host. + // CHECK: %[[CASTING_OP:.*]] = "ttnn.to_dtype"(%arg0) + // CHECK-SAME: dtype = #tt.supportedDataTypes + // CHECK-NEXT: return %[[CASTING_OP]] + %1 = "ttnn.to_layout"(%arg0) <{dtype = #tt.supportedDataTypes, layout = #ttnn.layout, memory_config = #ttnn.memory_config<#system_memory, <<2x4>>>}> : (tensor<64x128xf32, #ttnn_layout_host_tile>) -> tensor<64x128xbf16, #ttnn_layout_host_tile_bf16> + return %1 : tensor<64x128xbf16, #ttnn_layout_host_tile_bf16> + } + + // Test case when we move tensor from host to host for row-major case. + func.func @from_host_to_host_layout_to_layout_create_data_cast_op_rm(%arg0: tensor<64x128xf32, #ttnn_layout_host_rm>) -> tensor<64x128xbf16, #ttnn_layout_host_rm_bf16> { + // Typecast works only on device. Verify that for the row-major case when the output is on host, we insert the to_dtype op to cast the data type on host. + // CHECK: %[[CASTING_OP:.*]] = "ttnn.to_dtype"(%arg0) + // CHECK-SAME: dtype = #tt.supportedDataTypes + // CHECK-NEXT: return %[[CASTING_OP]] + %1 = "ttnn.to_layout"(%arg0) <{dtype = #tt.supportedDataTypes, layout = #ttnn.layout, memory_config = #ttnn.memory_config<#system_memory, <<64x128>>>}> : (tensor<64x128xf32, #ttnn_layout_host_rm>) -> tensor<64x128xbf16, #ttnn_layout_host_rm_bf16> + return %1 : tensor<64x128xbf16, #ttnn_layout_host_rm_bf16> + } + + // Test case when we move tensor from host to device for row-major case. + func.func @from_host_to_device_layout_to_layout_create_data_cast_op_rm(%arg0: tensor<64x128xf32, #ttnn_layout_host_rm>) -> tensor<64x128xbf16, #ttnn_layout_device_rm_bf16> { + // Typecast on device only works for tile layout. Verify that for the row-major case we insert the to_dtype op to cast the data type on host and than move the tensor to device. + // CHECK: %[[GET_DEVICE_OP:.*]] = "ttnn.get_device"() + // CHECK-NEXT: %[[CASTING_OP:.*]] = "ttnn.to_dtype"(%arg0) + // CHECK-SAME: dtype = #tt.supportedDataTypes + // CHECK: %[[TO_DEVICE_OP:.*]] = "ttnn.to_device"(%[[CASTING_OP]], %[[GET_DEVICE_OP]]) + // CHECK-SAME: memory_config = #ttnn.memory_config<#dram, <<64x128>>, > + // CHECK-NEXT: return %[[TO_DEVICE_OP]] + %0 = "ttnn.get_device"() <{mesh_shape = #ttnn}> : () -> !tt.device<#device> + %1 = "ttnn.to_layout"(%arg0, %0) <{dtype = #tt.supportedDataTypes, layout = #ttnn.layout, memory_config = #ttnn.memory_config<#dram, <<64x128>>, >}> : (tensor<64x128xf32, #ttnn_layout_host_rm>, !tt.device<#device>) -> tensor<64x128xbf16, #ttnn_layout_device_rm_bf16> + return %1 : tensor<64x128xbf16, #ttnn_layout_device_rm_bf16> + } + + // Test case when we move tensor from host to device for tile case. + func.func @from_host_to_device_layout_to_layout_create_data_cast_op_tile(%arg0: tensor<64x128xf32, #ttnn_layout_host_tile>) -> tensor<64x128xbf16, #ttnn_layout_device_tile_bf16> { + // Typecast on device only works for tile layout. Verify that for the tile case we insert the to_device op and the typecast op to cast the data type on device. 
+ // CHECK: %[[GET_DEVICE_OP:.*]] = "ttnn.get_device"() + // CHECK-NEXT: %[[TO_DEVICE_OP:.*]] = "ttnn.to_device"(%arg0, %[[GET_DEVICE_OP]]) + // CHECK-SAME: memory_config = #ttnn.memory_config<#dram, <<64x128>>, > + // CHECK-NEXT: %[[CASTING_OP:.*]] = "ttnn.typecast"(%[[TO_DEVICE_OP]]) + // CHECK-SAME: dtype = #tt.supportedDataTypes + // CHECK-NEXT: return %[[CASTING_OP]] + %0 = "ttnn.get_device"() <{mesh_shape = #ttnn}> : () -> !tt.device<#device> + %1 = "ttnn.to_layout"(%arg0, %0) <{dtype = #tt.supportedDataTypes, layout = #ttnn.layout, memory_config = #ttnn.memory_config<#dram, <<64x128>>, >}> : (tensor<64x128xf32, #ttnn_layout_host_tile>, !tt.device<#device>) -> tensor<64x128xbf16, #ttnn_layout_device_tile_bf16> + return %1 : tensor<64x128xbf16, #ttnn_layout_device_tile_bf16> + } + + // Test cases when we do layout transformation from host and we change tensor layout but we don't cast tensor data type. + // + + // Test case when we move tensor from host to host for tile -> row-major case. + func.func @from_host_to_host_dt_to_dt_from_tile_to_rm(%arg0: tensor<64x128xf32, #ttnn_layout_host_tile>) -> tensor<64x128xf32, #ttnn_layout_host_rm> { + // This test verifies that the `to_layout` operation is correctly inserted to change the layout from tile to row-major on the host. + // CHECK: %[[TO_LAYOUT_OP:.*]] = "ttnn.to_layout"(%arg0) + // CHECK-SAME: layout = #ttnn.layout + // CHECK-NEXT: return %[[TO_LAYOUT_OP]] + %1 = "ttnn.to_layout"(%arg0) <{dtype = #tt.supportedDataTypes, layout = #ttnn.layout, memory_config = #ttnn.memory_config<#system_memory, <<64x128>>>}> : (tensor<64x128xf32, #ttnn_layout_host_tile>) -> tensor<64x128xf32, #ttnn_layout_host_rm> + return %1 : tensor<64x128xf32, #ttnn_layout_host_rm> + } + + // Test case when we move tensor from host to host for row-major -> tile case. + func.func @from_host_to_host_dt_to_dt_from_rm_to_tile(%arg0: tensor<64x128xf32, #ttnn_layout_host_rm>) -> tensor<64x128xf32, #ttnn_layout_host_tile> { + // This test verifies that the `to_layout` operation is correctly inserted to change the layout from row-major to tile on the host. + // CHECK: %[[TO_LAYOUT_OP:.*]] = "ttnn.to_layout"(%arg0) + // CHECK-SAME: layout = #ttnn.layout + // CHECK-NEXT: return %[[TO_LAYOUT_OP]] + %1 = "ttnn.to_layout"(%arg0) <{dtype = #tt.supportedDataTypes, layout = #ttnn.layout, memory_config = #ttnn.memory_config<#system_memory, <<2x4>>>}> : (tensor<64x128xf32, #ttnn_layout_host_rm>) -> tensor<64x128xf32, #ttnn_layout_host_tile> + return %1 : tensor<64x128xf32, #ttnn_layout_host_tile> + } + + // Test case when we move tensor from host to device for tile -> row-major case. + func.func @from_host_to_device_dt_to_dt_from_tile_to_rm(%arg0: tensor<64x128xf32, #ttnn_layout_host_tile>) -> tensor<64x128xf32, #ttnn_layout_device_rm> { + // This test verifies that the `to_layout` and `to_device` operations are correctly inserted to change the layout from tile to row-major on the host and than move the tensor to the device. 
+ // CHECK: %[[GET_DEVICE_OP:.*]] = "ttnn.get_device"() + // CHECK-NEXT: %[[TO_LAYOUT_OP:.*]] = "ttnn.to_layout"(%arg0) + // CHECK-SAME: layout = #ttnn.layout + // CHECK-NEXT: %[[TO_DEVICE_OP:.*]] = "ttnn.to_device"(%[[TO_LAYOUT_OP]], %[[GET_DEVICE_OP]]) + // CHECK-NEXT: return %[[TO_DEVICE_OP]] + %0 = "ttnn.get_device"() <{mesh_shape = #ttnn}> : () -> !tt.device<#device> + %1 = "ttnn.to_layout"(%arg0, %0) <{dtype = #tt.supportedDataTypes, layout = #ttnn.layout, memory_config = #ttnn.memory_config<#dram, <<64x128>>, >}> : (tensor<64x128xf32, #ttnn_layout_host_tile>, !tt.device<#device>) -> tensor<64x128xf32, #ttnn_layout_device_rm> + return %1 : tensor<64x128xf32, #ttnn_layout_device_rm> + } + + // Test case when we move tensor from host to device for row-major -> tile case for bf16 data type. + func.func @from_host_to_device_dt_to_dt_from_rm_to_tile_bf16(%arg0: tensor<64x128xbf16, #ttnn_layout_host_rm_bf16>) -> tensor<64x128xbf16, #ttnn_layout_device_tile_bf16> { + // This test verifies that the `to_device` and `to_layout` operations are correctly inserted to change the layout from row-major to tile on the device. + // Specifically, it ensures that BF16 tiling is performed on the device. + // CHECK: %[[GET_DEVICE_OP:.*]] = "ttnn.get_device"() + // CHECK-NEXT: %[[TO_DEVICE_OP:.*]] = "ttnn.to_device"(%arg0, %[[GET_DEVICE_OP]]) + // CHECK-NEXT: %[[TO_LAYOUT_OP:.*]] = "ttnn.to_layout"(%[[TO_DEVICE_OP]], %[[GET_DEVICE_OP]]) + // CHECK-SAME: layout = #ttnn.layout + // CHECK-NEXT: return %[[TO_LAYOUT_OP]] + %0 = "ttnn.get_device"() <{mesh_shape = #ttnn}> : () -> !tt.device<#device> + %1 = "ttnn.to_layout"(%arg0, %0) <{dtype = #tt.supportedDataTypes, layout = #ttnn.layout, memory_config = #ttnn.memory_config<#dram, <<2x4>>, >}> : (tensor<64x128xbf16, #ttnn_layout_host_rm_bf16>, !tt.device<#device>) -> tensor<64x128xbf16, #ttnn_layout_device_tile_bf16> + return %1 : tensor<64x128xbf16, #ttnn_layout_device_tile_bf16> + } + + // Test case when we move tensor from host to device for row-major -> tile case for non-bf16 data type. + func.func @from_host_to_device_dt_to_dt_from_rm_to_tile_f32(%arg0: tensor<64x128xf32, #ttnn_layout_host_rm>) -> tensor<64x128xf32, #ttnn_layout_device_tile> { + // This test verifies that the `to_layout` and `to_device` operations are correctly inserted to change the layout from row-major to tile on the host for non bf16 data type. + // Specifically, it ensures that non-BF16 tiling is performed on the host and then moved to the device. + // CHECK: %[[GET_DEVICE_OP:.*]] = "ttnn.get_device"() + // CHECK-NEXT: %[[TO_LAYOUT_OP:.*]] = "ttnn.to_layout"(%arg0) + // CHECK-SAME: layout = #ttnn.layout + // CHECK-NEXT: %[[TO_DEVICE_OP:.*]] = "ttnn.to_device"(%[[TO_LAYOUT_OP]], %[[GET_DEVICE_OP]]) + // CHECK-NEXT: return %[[TO_DEVICE_OP]] + %0 = "ttnn.get_device"() <{mesh_shape = #ttnn}> : () -> !tt.device<#device> + %1 = "ttnn.to_layout"(%arg0, %0) <{dtype = #tt.supportedDataTypes, layout = #ttnn.layout, memory_config = #ttnn.memory_config<#dram, <<2x4>>, >}> : (tensor<64x128xf32, #ttnn_layout_host_rm>, !tt.device<#device>) -> tensor<64x128xf32, #ttnn_layout_device_tile> + return %1 : tensor<64x128xf32, #ttnn_layout_device_tile> + } + + // Test cases when we do layout transformation from host and we change both tensor layout and tensor data type. + // + + // Test case when we move tensor from host to host for tile -> row-major case and data type cast. 
+ func.func @from_host_to_host_from_bf16_to_f32_from_tile_to_rm(%arg0: tensor<64x128xbf16, #ttnn_layout_host_tile_bf16>) -> tensor<64x128xf32, #ttnn_layout_host_rm> { + // This test verifies that the `to_layout` and `to_dtype` operations are correctly inserted to change the layout from tile to row-major and cast data type from bf16 to f32 on host. + // CHECK: %[[CASTING_OP:.*]] = "ttnn.to_dtype"(%arg0) + // CHECK-SAME: dtype = #tt.supportedDataTypes + // CHECK-NEXT: %[[TO_LAYOUT_OP:.*]] = "ttnn.to_layout"(%[[CASTING_OP]]) + // CHECK-SAME: layout = #ttnn.layout + // CHECK-NEXT: return %[[TO_LAYOUT_OP]] + %1 = "ttnn.to_layout"(%arg0) <{dtype = #tt.supportedDataTypes, layout = #ttnn.layout, memory_config = #ttnn.memory_config<#system_memory, <<64x128>>>}> : (tensor<64x128xbf16, #ttnn_layout_host_tile_bf16>) -> tensor<64x128xf32, #ttnn_layout_host_rm> + return %1 : tensor<64x128xf32, #ttnn_layout_host_rm> + } + + // Test case when we move tensor from host to host for row-major -> tile case and data type cast. + func.func @from_host_to_host_from_bf16_to_f32_from_rm_to_tile(%arg0: tensor<64x128xbf16, #ttnn_layout_host_rm_bf16>) -> tensor<64x128xf32, #ttnn_layout_host_tile> { + // This test verifies that the `to_layout` and `to_dtype` operations are correctly inserted to change the layout from row-major to tile and cast data type from bf16 to f32 on host. + // CHECK: %[[CASTING_OP:.*]] = "ttnn.to_dtype"(%arg0) + // CHECK-SAME: dtype = #tt.supportedDataTypes + // CHECK-NEXT: %[[TO_LAYOUT_OP:.*]] = "ttnn.to_layout"(%[[CASTING_OP]]) + // CHECK-SAME: layout = #ttnn.layout + // CHECK-NEXT: return %[[TO_LAYOUT_OP]] + %1 = "ttnn.to_layout"(%arg0) <{dtype = #tt.supportedDataTypes, layout = #ttnn.layout, memory_config = #ttnn.memory_config<#system_memory, <<2x4>>>}> : (tensor<64x128xbf16, #ttnn_layout_host_rm_bf16>) -> tensor<64x128xf32, #ttnn_layout_host_tile> + return %1 : tensor<64x128xf32, #ttnn_layout_host_tile> + } + + // Test case when we move tensor from host to device for tile -> row-major case and cast input from bf16. + func.func @from_host_to_device_data_type_from_bf16_to_f32_from_tile_to_rm(%arg0: tensor<64x128xbf16, #ttnn_layout_host_tile_bf16>) -> tensor<64x128xf32, #ttnn_layout_device_rm> { + // This test verifies that the `to_dtype`, `to_layout` and `to_device` operations are correctly inserted to change the layout from tile to row-major and cast data type from bf16 to f32 on host and then move tensor to device. + // CHECK: %[[GET_DEVICE_OP:.*]] = "ttnn.get_device"() + // CHECK-NEXT: %[[CASTING_OP:.*]] = "ttnn.to_dtype"(%arg0) + // CHECK-SAME: dtype = #tt.supportedDataTypes + // CHECK-NEXT: %[[TO_LAYOUT_OP:.*]] = "ttnn.to_layout"(%[[CASTING_OP]]) + // CHECK-SAME: layout = #ttnn.layout + // CHECK-NEXT: %[[TO_DEVICE_OP:.*]] = "ttnn.to_device"(%[[TO_LAYOUT_OP]], %[[GET_DEVICE_OP]]) + // CHECK-SAME: memory_config = #ttnn.memory_config<#dram, <<64x128>>, > + // CHECK-NEXT: return %[[TO_DEVICE_OP]] + %0 = "ttnn.get_device"() <{mesh_shape = #ttnn}> : () -> !tt.device<#device> + %1 = "ttnn.to_layout"(%arg0, %0) <{dtype = #tt.supportedDataTypes, layout = #ttnn.layout, memory_config = #ttnn.memory_config<#dram, <<64x128>>, >}> : (tensor<64x128xbf16, #ttnn_layout_host_tile_bf16>, !tt.device<#device>) -> tensor<64x128xf32, #ttnn_layout_device_rm> + return %1 : tensor<64x128xf32, #ttnn_layout_device_rm> + } + + // Test case when we move tensor from host to device for row-major -> tile case and cast input from bf16. 
+ func.func @from_host_to_device_data_type_from_bf16_to_f32_from_rm_to_tile(%arg0: tensor<64x128xbf16, #ttnn_layout_host_rm_bf16>) -> tensor<64x128xf32, #ttnn_layout_device_tile> { + // This test verifies that the `to_device`, `to_layout` and `typecast` operations are correctly inserted to change the layout from row-major to tile and cast + // data type from bf16 to f32 on device. + // CHECK: %[[GET_DEVICE_OP:.*]] = "ttnn.get_device"() + // CHECK-NEXT: %[[TO_DEVICE_OP:.*]] = "ttnn.to_device"(%arg0, %[[GET_DEVICE_OP]]) + // CHECK-NEXT: %[[TO_LAYOUT_OP:.*]] = "ttnn.to_layout"(%[[TO_DEVICE_OP]], %[[GET_DEVICE_OP]]) + // CHECK-SAME: layout = #ttnn.layout + // CHECK-NEXT: %[[CASTING_OP:.*]] = "ttnn.typecast"(%[[TO_LAYOUT_OP]]) + // CHECK-SAME: dtype = #tt.supportedDataTypes + // CHECK-NEXT: return %[[CASTING_OP]] + %0 = "ttnn.get_device"() <{mesh_shape = #ttnn}> : () -> !tt.device<#device> + %1 = "ttnn.to_layout"(%arg0, %0) <{dtype = #tt.supportedDataTypes, layout = #ttnn.layout, memory_config = #ttnn.memory_config<#dram, <<2x4>>, >}> : (tensor<64x128xbf16, #ttnn_layout_host_rm_bf16>, !tt.device<#device>) -> tensor<64x128xf32, #ttnn_layout_device_tile> + return %1 : tensor<64x128xf32, #ttnn_layout_device_tile> + } + + // Test case when we move tensor from host to device for row-major -> tile case and cast input to bf16. + func.func @from_host_to_device_data_type_from_f32_to_bf16_from_rm_to_tile(%arg0: tensor<64x128xf32, #ttnn_layout_host_rm>) -> tensor<64x128xbf16, #ttnn_layout_device_tile_bf16> { + // This test verifies that the `to_dtype`, `to_device` and `to_layout` operations are correctly inserted to cast the data type from f32 to bf16 on host and then move tensor to device and change the layout from row-major to tile. + // CHECK: %[[GET_DEVICE_OP:.*]] = "ttnn.get_device"() + // CHECK-NEXT: %[[CASTING_OP:.*]] = "ttnn.to_dtype"(%arg0) + // CHECK-SAME: dtype = #tt.supportedDataTypes + // CHECK-NEXT: %[[TO_DEVICE_OP:.*]] = "ttnn.to_device"(%[[CASTING_OP]], %[[GET_DEVICE_OP]]) + // CHECK-NEXT: %[[TO_LAYOUT_OP:.*]] = "ttnn.to_layout"(%[[TO_DEVICE_OP]], %[[GET_DEVICE_OP]]) + // CHECK-SAME: layout = #ttnn.layout + // CHECK-NEXT: return %[[TO_LAYOUT_OP]] + %0 = "ttnn.get_device"() <{mesh_shape = #ttnn}> : () -> !tt.device<#device> + %1 = "ttnn.to_layout"(%arg0, %0) <{dtype = #tt.supportedDataTypes, layout = #ttnn.layout, memory_config = #ttnn.memory_config<#dram, <<2x4>>, >}> : (tensor<64x128xf32, #ttnn_layout_host_rm>, !tt.device<#device>) -> tensor<64x128xbf16, #ttnn_layout_device_tile_bf16> + return %1 : tensor<64x128xbf16, #ttnn_layout_device_tile_bf16> + } + + // Test case when we move tensor from host to device for row-major -> tile case and we don't cast data type to bf16 nor from bf16. + func.func @from_host_to_device_data_type_from_f32_to_u32_from_rm_to_tile(%arg0: tensor<64x128xf32, #ttnn_layout_host_rm>) -> tensor<64x128xi32, #ttnn_layout_device_tile_u32> { + // This test verifies that the `to_dtype`, `to_layout` and `to_device` operations are correctly inserted to cast the data type from f32 to f16 and tilize on host and then move tensor to device. 
+ // CHECK: %[[GET_DEVICE_OP:.*]] = "ttnn.get_device"() + // CHECK-NEXT: %[[CASTING_OP:.*]] = "ttnn.to_dtype"(%arg0) + // CHECK-SAME: dtype = #tt.supportedDataTypes + // CHECK-NEXT: %[[TO_LAYOUT_OP:.*]] = "ttnn.to_layout"(%[[CASTING_OP]]) + // CHECK-SAME: layout = #ttnn.layout + // CHECK-NEXT: %[[TO_DEVICE_OP:.*]] = "ttnn.to_device"(%[[TO_LAYOUT_OP]], %[[GET_DEVICE_OP]]) + // CHECK-NEXT: return %[[TO_DEVICE_OP]] + %0 = "ttnn.get_device"() <{mesh_shape = #ttnn}> : () -> !tt.device<#device> + %1 = "ttnn.to_layout"(%arg0, %0) <{dtype = #tt.supportedDataTypes, layout = #ttnn.layout, memory_config = #ttnn.memory_config<#dram, <<2x4>>, >}> : (tensor<64x128xf32, #ttnn_layout_host_rm>, !tt.device<#device>) -> tensor<64x128xi32, #ttnn_layout_device_tile_u32> + return %1 : tensor<64x128xi32, #ttnn_layout_device_tile_u32> + } +} diff --git a/test/ttmlir/Dialect/TTNN/simple_clamp.mlir b/test/ttmlir/Dialect/TTNN/simple_clamp.mlir index 272e07175b..f341792c74 100644 --- a/test/ttmlir/Dialect/TTNN/simple_clamp.mlir +++ b/test/ttmlir/Dialect/TTNN/simple_clamp.mlir @@ -2,8 +2,9 @@ module attributes {} { func.func @clamp(%arg0: tensor<64x128xbf16>) -> tensor<64x128xbf16> { %0 = tensor.empty() : tensor<64x128xbf16> + // CHECK: %[[GET_DEVICE:.*]] = "ttnn.get_device"() // CHECK: %[[DEVICE:.*]] = "ttnn.to_device"(%arg0, - // CHECK: %[[LAYOUT:.*]] = "ttnn.to_layout"(%[[DEVICE]]) + // CHECK: %[[LAYOUT:.*]] = "ttnn.to_layout"(%[[DEVICE]], %[[GET_DEVICE]]) // CHECK: = "ttnn.clamp"(%[[LAYOUT]]) // CHECK-SAME: {max = 3.000000e+00 : f32, min = 2.000000e+00 : f32} // CHECK-SAME: [[TENSOR:tensor<64x128xbf16]], #ttnn_layout{{[0-9]+}}>) -> [[TENSOR]] diff --git a/test/ttmlir/EmitC/TTNN/other/embedding.mlir b/test/ttmlir/EmitC/TTNN/other/embedding.mlir index b7a42638ac..627b514b06 100644 --- a/test/ttmlir/EmitC/TTNN/other/embedding.mlir +++ b/test/ttmlir/EmitC/TTNN/other/embedding.mlir @@ -3,9 +3,6 @@ // RUN: ttmlir-opt --ttnn-modify-signatures-for-dylib --convert-ttnn-to-emitc %t.mlir > %t2.mlir // RUN: ttmlir-translate --mlir-to-cpp %t2.mlir > %basename_t.cpp -// UNSUPPORTED: true -// Outstanding bug: https://github.com/tenstorrent/tt-mlir/issues/1938 - func.func @embedding(%arg0: tensor<32x32xbf16>, %arg1: tensor<512x128xbf16>) -> tensor<32x32x128xbf16> { %0 = tensor.empty() : tensor<32x32x128xbf16> %1 = "ttir.embedding"(%arg0, %arg1, %0) : (tensor<32x32xbf16>, tensor<512x128xbf16>, tensor<32x32x128xbf16>) -> tensor<32x32x128xbf16> diff --git a/test/ttmlir/Silicon/TTNN/Transforms/DecomposeLayouts/create_system_desc_device.mlir b/test/ttmlir/Silicon/TTNN/Transforms/DecomposeLayouts/create_system_desc_device.mlir new file mode 100644 index 0000000000..07f60229c9 --- /dev/null +++ b/test/ttmlir/Silicon/TTNN/Transforms/DecomposeLayouts/create_system_desc_device.mlir @@ -0,0 +1,4 @@ +// RUN: ttmlir-opt %s +// UNSUPPORTED: true +module { +} diff --git a/test/ttmlir/Silicon/TTNN/Transforms/DecomposeLayouts/decomposing_layouts_from_host.mlir b/test/ttmlir/Silicon/TTNN/Transforms/DecomposeLayouts/decomposing_layouts_from_host.mlir new file mode 100644 index 0000000000..b2416fc1ee --- /dev/null +++ b/test/ttmlir/Silicon/TTNN/Transforms/DecomposeLayouts/decomposing_layouts_from_host.mlir @@ -0,0 +1,240 @@ +// RUN: ttmlir-opt --ttir-load-system-desc="path=%system_desc_path%" --ttir-implicit-device %ttmlir_test_root/ttmlir/Silicon/TTNN/Transforms/DecomposeLayouts/create_system_desc_device.mlir > %t.mlir +// RUN: python %ttmlir_scripts_root/extract-and-replace-system-desc-and-device.py %t.mlir %s > %t_replaced.mlir +// RUN: 
ttmlir-opt --ttnn-decompose-layouts %t_replaced.mlir > %t_ttnn_mlir.mlir +// RUN: FileCheck %t_replaced.mlir --input-file=%t_ttnn_mlir.mlir +// RUN: ttmlir-translate --ttnn-to-flatbuffer %t_ttnn_mlir.mlir > %t.ttnn +#device = #tt.device<> +#system_desc = #tt.system_desc<> +#dram = #ttnn.buffer_type +#system_memory = #ttnn.buffer_type +#ttnn_layout_host_rm = #ttnn.ttnn_layout<(d0, d1) -> (d0, d1), <1x1>, memref<64x128xf32, #system_memory>> +#ttnn_layout_host_rm_bf16 = #ttnn.ttnn_layout<(d0, d1) -> (d0, d1), <1x1>, memref<64x128xbf16, #system_memory>> +#ttnn_layout_host_tile = #ttnn.ttnn_layout<(d0, d1) -> (d0, d1), <1x1>, memref<2x4x!tt.tile<32x32, f32>, #system_memory>> +#ttnn_layout_host_tile_bf16 = #ttnn.ttnn_layout<(d0, d1) -> (d0, d1), <1x1>, memref<2x4x!tt.tile<32x32, bf16>, #system_memory>> +#ttnn_layout_device_rm = #ttnn.ttnn_layout<(d0, d1) -> (d0, d1), <1x1>, memref<64x128xf32, #dram>, > +#ttnn_layout_device_tile = #ttnn.ttnn_layout<(d0, d1) -> (d0, d1), <1x1>, memref<2x4x!tt.tile<32x32, f32>, #dram>, > +#ttnn_layout_device_tile_bf16 = #ttnn.ttnn_layout<(d0, d1) -> (d0, d1), <1x1>, memref<2x4x!tt.tile<32x32, bf16>, #dram>, > +#ttnn_layout_device_tile_u32 = #ttnn.ttnn_layout<(d0, d1) -> (d0, d1), <1x1>, memref<2x4x!tt.tile<32x32, u32>, #dram>, > +#ttnn_layout_device_rm_bf16 = #ttnn.ttnn_layout<(d0, d1) -> (d0, d1), <1x1>, memref<64x128xbf16, #dram>, > +#ttnn_layout1 = #ttnn.ttnn_layout<(d0, d1) -> (d0, d1), <1x1>, memref<64x128xbf16, #system_memory>> +#ttnn_layout2 = #ttnn.ttnn_layout<(d0, d1) -> (d0, d1), <1x1>, memref<2x4x!tt.tile<32x32, bf16>, #dram>, > +module attributes {tt.device = #device, tt.system_desc = #system_desc} { + // Test cases when we do layout transformation from host and we don't change tensor layout and tensor data type + // + + // Test case when we move tensor from host to device. + func.func @from_host_to_device_layout_to_layout_dt_to_dt_create_to_device_op(%arg0: tensor<64x128xf32, #ttnn_layout_host_rm>) -> tensor<64x128xf32, #ttnn_layout_device_rm> { + // Verify that we only insert the to_device op when there are no layout or data type changes. + // CHECK: %[[GET_DEVICE_OP:.*]] = "ttnn.get_device"() + // CHECK: %[[TO_DEVICE_OP:.*]] = "ttnn.to_device"(%arg0, %[[GET_DEVICE_OP]]) + // CHECK-SAME: memory_config = #ttnn.memory_config<#dram, <<64x128>>, > + // CHECK: return %[[TO_DEVICE_OP]] + %0 = "ttnn.get_device"() <{mesh_shape = #ttnn}> : () -> !tt.device<#device> + %1 = "ttnn.to_layout"(%arg0, %0) <{dtype = #tt.supportedDataTypes, layout = #ttnn.layout, memory_config = #ttnn.memory_config<#dram, <<64x128>>, >}> : (tensor<64x128xf32, #ttnn_layout_host_rm>, !tt.device<#device>) -> tensor<64x128xf32, #ttnn_layout_device_rm> + return %1 : tensor<64x128xf32, #ttnn_layout_device_rm> + } + + // Test cases when we do layout transformation from host and we don't change tensor layout but we cast tensor data type. + // + + // Test case when we move tensor from host to host for tile case. + func.func @from_host_to_host_layout_to_layout_create_data_cast_op_tile(%arg0: tensor<64x128xf32, #ttnn_layout_host_tile>) -> tensor<64x128xbf16, #ttnn_layout_host_tile_bf16> { + // Typecast works only on device. Verify that for the tile case when the output is on host, we insert the to_dtype op to cast the data type on host. 
+    // CHECK: %[[CASTING_OP:.*]] = "ttnn.to_dtype"(%arg0)
+    // CHECK-SAME: dtype = #tt.supportedDataTypes
+    // CHECK-NEXT: return %[[CASTING_OP]]
+    %1 = "ttnn.to_layout"(%arg0) <{dtype = #tt.supportedDataTypes, layout = #ttnn.layout, memory_config = #ttnn.memory_config<#system_memory, <<2x4>>>}> : (tensor<64x128xf32, #ttnn_layout_host_tile>) -> tensor<64x128xbf16, #ttnn_layout_host_tile_bf16>
+    return %1 : tensor<64x128xbf16, #ttnn_layout_host_tile_bf16>
+  }
+
+  // Test case when we move tensor from host to host for row-major case.
+  func.func @from_host_to_host_layout_to_layout_create_data_cast_op_rm(%arg0: tensor<64x128xf32, #ttnn_layout_host_rm>) -> tensor<64x128xbf16, #ttnn_layout_host_rm_bf16> {
+    // Typecast works only on device. Verify that for the row-major case when the output is on host, we insert the to_dtype op to cast the data type on host.
+    // CHECK: %[[CASTING_OP:.*]] = "ttnn.to_dtype"(%arg0)
+    // CHECK-SAME: dtype = #tt.supportedDataTypes
+    // CHECK-NEXT: return %[[CASTING_OP]]
+    %1 = "ttnn.to_layout"(%arg0) <{dtype = #tt.supportedDataTypes, layout = #ttnn.layout, memory_config = #ttnn.memory_config<#system_memory, <<64x128>>>}> : (tensor<64x128xf32, #ttnn_layout_host_rm>) -> tensor<64x128xbf16, #ttnn_layout_host_rm_bf16>
+    return %1 : tensor<64x128xbf16, #ttnn_layout_host_rm_bf16>
+  }
+
+  // Test case when we move tensor from host to device for row-major case.
+  func.func @from_host_to_device_layout_to_layout_create_data_cast_op_rm(%arg0: tensor<64x128xf32, #ttnn_layout_host_rm>) -> tensor<64x128xbf16, #ttnn_layout_device_rm_bf16> {
+    // Typecast on device only works for tile layout. Verify that for the row-major case we insert the to_dtype op to cast the data type on host and then move the tensor to device.
+    // CHECK: %[[GET_DEVICE_OP:.*]] = "ttnn.get_device"()
+    // CHECK-NEXT: %[[CASTING_OP:.*]] = "ttnn.to_dtype"(%arg0)
+    // CHECK-SAME: dtype = #tt.supportedDataTypes
+    // CHECK: %[[TO_DEVICE_OP:.*]] = "ttnn.to_device"(%[[CASTING_OP]], %[[GET_DEVICE_OP]])
+    // CHECK-SAME: memory_config = #ttnn.memory_config<#dram, <<64x128>>, >
+    // CHECK-NEXT: return %[[TO_DEVICE_OP]]
+    %0 = "ttnn.get_device"() <{mesh_shape = #ttnn}> : () -> !tt.device<#device>
+    %1 = "ttnn.to_layout"(%arg0, %0) <{dtype = #tt.supportedDataTypes, layout = #ttnn.layout, memory_config = #ttnn.memory_config<#dram, <<64x128>>, >}> : (tensor<64x128xf32, #ttnn_layout_host_rm>, !tt.device<#device>) -> tensor<64x128xbf16, #ttnn_layout_device_rm_bf16>
+    return %1 : tensor<64x128xbf16, #ttnn_layout_device_rm_bf16>
+  }
+
+  // Test case when we move tensor from host to device for tile case.
+  func.func @from_host_to_device_layout_to_layout_create_data_cast_op_tile(%arg0: tensor<64x128xf32, #ttnn_layout_host_tile>) -> tensor<64x128xbf16, #ttnn_layout_device_tile_bf16> {
+    // Typecast on device only works for tile layout. Verify that for the tile case we insert the to_device op and the typecast op to cast the data type on device.
+    // CHECK: %[[GET_DEVICE_OP:.*]] = "ttnn.get_device"()
+    // CHECK-NEXT: %[[TO_DEVICE_OP:.*]] = "ttnn.to_device"(%arg0, %[[GET_DEVICE_OP]])
+    // CHECK-SAME: memory_config = #ttnn.memory_config<#dram, <<64x128>>, >
+    // CHECK-NEXT: %[[CASTING_OP:.*]] = "ttnn.typecast"(%[[TO_DEVICE_OP]])
+    // CHECK-SAME: dtype = #tt.supportedDataTypes
+    // CHECK-NEXT: return %[[CASTING_OP]]
+    %0 = "ttnn.get_device"() <{mesh_shape = #ttnn}> : () -> !tt.device<#device>
+    %1 = "ttnn.to_layout"(%arg0, %0) <{dtype = #tt.supportedDataTypes, layout = #ttnn.layout, memory_config = #ttnn.memory_config<#dram, <<64x128>>, >}> : (tensor<64x128xf32, #ttnn_layout_host_tile>, !tt.device<#device>) -> tensor<64x128xbf16, #ttnn_layout_device_tile_bf16>
+    return %1 : tensor<64x128xbf16, #ttnn_layout_device_tile_bf16>
+  }
+
+  // Test cases when we do layout transformation from host and we change tensor layout but we don't cast tensor data type.
+  //
+
+  // Test case when we move tensor from host to host for tile -> row-major case.
+  func.func @from_host_to_host_dt_to_dt_from_tile_to_rm(%arg0: tensor<64x128xf32, #ttnn_layout_host_tile>) -> tensor<64x128xf32, #ttnn_layout_host_rm> {
+    // This test verifies that the `to_layout` operation is correctly inserted to change the layout from tile to row-major on the host.
+    // CHECK: %[[TO_LAYOUT_OP:.*]] = "ttnn.to_layout"(%arg0)
+    // CHECK-SAME: layout = #ttnn.layout
+    // CHECK-NEXT: return %[[TO_LAYOUT_OP]]
+    %1 = "ttnn.to_layout"(%arg0) <{dtype = #tt.supportedDataTypes, layout = #ttnn.layout, memory_config = #ttnn.memory_config<#system_memory, <<64x128>>>}> : (tensor<64x128xf32, #ttnn_layout_host_tile>) -> tensor<64x128xf32, #ttnn_layout_host_rm>
+    return %1 : tensor<64x128xf32, #ttnn_layout_host_rm>
+  }
+
+  // Test case when we move tensor from host to host for row-major -> tile case.
+  func.func @from_host_to_host_dt_to_dt_from_rm_to_tile(%arg0: tensor<64x128xf32, #ttnn_layout_host_rm>) -> tensor<64x128xf32, #ttnn_layout_host_tile> {
+    // This test verifies that the `to_layout` operation is correctly inserted to change the layout from row-major to tile on the host.
+    // CHECK: %[[TO_LAYOUT_OP:.*]] = "ttnn.to_layout"(%arg0)
+    // CHECK-SAME: layout = #ttnn.layout
+    // CHECK-NEXT: return %[[TO_LAYOUT_OP]]
+    %1 = "ttnn.to_layout"(%arg0) <{dtype = #tt.supportedDataTypes, layout = #ttnn.layout, memory_config = #ttnn.memory_config<#system_memory, <<2x4>>>}> : (tensor<64x128xf32, #ttnn_layout_host_rm>) -> tensor<64x128xf32, #ttnn_layout_host_tile>
+    return %1 : tensor<64x128xf32, #ttnn_layout_host_tile>
+  }
+
+  // Test case when we move tensor from host to device for tile -> row-major case.
+  func.func @from_host_to_device_dt_to_dt_from_tile_to_rm(%arg0: tensor<64x128xf32, #ttnn_layout_host_tile>) -> tensor<64x128xf32, #ttnn_layout_device_rm> {
+    // This test verifies that the `to_layout` and `to_device` operations are correctly inserted to change the layout from tile to row-major on the host and then move the tensor to the device.
+ // CHECK: %[[GET_DEVICE_OP:.*]] = "ttnn.get_device"() + // CHECK-NEXT: %[[TO_LAYOUT_OP:.*]] = "ttnn.to_layout"(%arg0) + // CHECK-SAME: layout = #ttnn.layout + // CHECK-NEXT: %[[TO_DEVICE_OP:.*]] = "ttnn.to_device"(%[[TO_LAYOUT_OP]], %[[GET_DEVICE_OP]]) + // CHECK-NEXT: return %[[TO_DEVICE_OP]] + %0 = "ttnn.get_device"() <{mesh_shape = #ttnn}> : () -> !tt.device<#device> + %1 = "ttnn.to_layout"(%arg0, %0) <{dtype = #tt.supportedDataTypes, layout = #ttnn.layout, memory_config = #ttnn.memory_config<#dram, <<64x128>>, >}> : (tensor<64x128xf32, #ttnn_layout_host_tile>, !tt.device<#device>) -> tensor<64x128xf32, #ttnn_layout_device_rm> + return %1 : tensor<64x128xf32, #ttnn_layout_device_rm> + } + + // Test case when we move tensor from host to device for row-major -> tile case for bf16 data type. + func.func @from_host_to_device_dt_to_dt_from_rm_to_tile_bf16(%arg0: tensor<64x128xbf16, #ttnn_layout_host_rm_bf16>) -> tensor<64x128xbf16, #ttnn_layout_device_tile_bf16> { + // This test verifies that the `to_device` and `to_layout` operations are correctly inserted to change the layout from row-major to tile on the device. + // Specifically, it ensures that BF16 tiling is performed on the device. + // CHECK: %[[GET_DEVICE_OP:.*]] = "ttnn.get_device"() + // CHECK-NEXT: %[[TO_DEVICE_OP:.*]] = "ttnn.to_device"(%arg0, %[[GET_DEVICE_OP]]) + // CHECK-NEXT: %[[TO_LAYOUT_OP:.*]] = "ttnn.to_layout"(%[[TO_DEVICE_OP]], %[[GET_DEVICE_OP]]) + // CHECK-SAME: layout = #ttnn.layout + // CHECK-NEXT: return %[[TO_LAYOUT_OP]] + %0 = "ttnn.get_device"() <{mesh_shape = #ttnn}> : () -> !tt.device<#device> + %1 = "ttnn.to_layout"(%arg0, %0) <{dtype = #tt.supportedDataTypes, layout = #ttnn.layout, memory_config = #ttnn.memory_config<#dram, <<2x4>>, >}> : (tensor<64x128xbf16, #ttnn_layout_host_rm_bf16>, !tt.device<#device>) -> tensor<64x128xbf16, #ttnn_layout_device_tile_bf16> + return %1 : tensor<64x128xbf16, #ttnn_layout_device_tile_bf16> + } + + // Test case when we move tensor from host to device for row-major -> tile case for non-bf16 data type. + func.func @from_host_to_device_dt_to_dt_from_rm_to_tile_f32(%arg0: tensor<64x128xf32, #ttnn_layout_host_rm>) -> tensor<64x128xf32, #ttnn_layout_device_tile> { + // This test verifies that the `to_layout` and `to_device` operations are correctly inserted to change the layout from row-major to tile on the host for non bf16 data type. + // Specifically, it ensures that non-BF16 tiling is performed on the host and then moved to the device. + // CHECK: %[[GET_DEVICE_OP:.*]] = "ttnn.get_device"() + // CHECK-NEXT: %[[TO_LAYOUT_OP:.*]] = "ttnn.to_layout"(%arg0) + // CHECK-SAME: layout = #ttnn.layout + // CHECK-NEXT: %[[TO_DEVICE_OP:.*]] = "ttnn.to_device"(%[[TO_LAYOUT_OP]], %[[GET_DEVICE_OP]]) + // CHECK-NEXT: return %[[TO_DEVICE_OP]] + %0 = "ttnn.get_device"() <{mesh_shape = #ttnn}> : () -> !tt.device<#device> + %1 = "ttnn.to_layout"(%arg0, %0) <{dtype = #tt.supportedDataTypes, layout = #ttnn.layout, memory_config = #ttnn.memory_config<#dram, <<2x4>>, >}> : (tensor<64x128xf32, #ttnn_layout_host_rm>, !tt.device<#device>) -> tensor<64x128xf32, #ttnn_layout_device_tile> + return %1 : tensor<64x128xf32, #ttnn_layout_device_tile> + } + + // Test cases when we do layout transformation from host and we change both tensor layout and tensor data type. + // + + // Test case when we move tensor from host to host for tile -> row-major case and data type cast. 
+ func.func @from_host_to_host_from_bf16_to_f32_from_tile_to_rm(%arg0: tensor<64x128xbf16, #ttnn_layout_host_tile_bf16>) -> tensor<64x128xf32, #ttnn_layout_host_rm> { + // This test verifies that the `to_layout` and `to_dtype` operations are correctly inserted to change the layout from tile to row-major and cast data type from bf16 to f32 on host. + // CHECK: %[[CASTING_OP:.*]] = "ttnn.to_dtype"(%arg0) + // CHECK-SAME: dtype = #tt.supportedDataTypes + // CHECK-NEXT: %[[TO_LAYOUT_OP:.*]] = "ttnn.to_layout"(%[[CASTING_OP]]) + // CHECK-SAME: layout = #ttnn.layout + // CHECK-NEXT: return %[[TO_LAYOUT_OP]] + %1 = "ttnn.to_layout"(%arg0) <{dtype = #tt.supportedDataTypes, layout = #ttnn.layout, memory_config = #ttnn.memory_config<#system_memory, <<64x128>>>}> : (tensor<64x128xbf16, #ttnn_layout_host_tile_bf16>) -> tensor<64x128xf32, #ttnn_layout_host_rm> + return %1 : tensor<64x128xf32, #ttnn_layout_host_rm> + } + + // Test case when we move tensor from host to host for row-major -> tile case and data type cast. + func.func @from_host_to_host_from_bf16_to_f32_from_rm_to_tile(%arg0: tensor<64x128xbf16, #ttnn_layout_host_rm_bf16>) -> tensor<64x128xf32, #ttnn_layout_host_tile> { + // This test verifies that the `to_layout` and `to_dtype` operations are correctly inserted to change the layout from row-major to tile and cast data type from bf16 to f32 on host. + // CHECK: %[[CASTING_OP:.*]] = "ttnn.to_dtype"(%arg0) + // CHECK-SAME: dtype = #tt.supportedDataTypes + // CHECK-NEXT: %[[TO_LAYOUT_OP:.*]] = "ttnn.to_layout"(%[[CASTING_OP]]) + // CHECK-SAME: layout = #ttnn.layout + // CHECK-NEXT: return %[[TO_LAYOUT_OP]] + %1 = "ttnn.to_layout"(%arg0) <{dtype = #tt.supportedDataTypes, layout = #ttnn.layout, memory_config = #ttnn.memory_config<#system_memory, <<2x4>>>}> : (tensor<64x128xbf16, #ttnn_layout_host_rm_bf16>) -> tensor<64x128xf32, #ttnn_layout_host_tile> + return %1 : tensor<64x128xf32, #ttnn_layout_host_tile> + } + + // Test case when we move tensor from host to device for tile -> row-major case and cast input from bf16. + func.func @from_host_to_device_data_type_from_bf16_to_f32_from_tile_to_rm(%arg0: tensor<64x128xbf16, #ttnn_layout_host_tile_bf16>) -> tensor<64x128xf32, #ttnn_layout_device_rm> { + // This test verifies that the `to_dtype`, `to_layout` and `to_device` operations are correctly inserted to change the layout from tile to row-major and cast data type from bf16 to f32 on host and then move tensor to device. + // CHECK: %[[GET_DEVICE_OP:.*]] = "ttnn.get_device"() + // CHECK-NEXT: %[[CASTING_OP:.*]] = "ttnn.to_dtype"(%arg0) + // CHECK-SAME: dtype = #tt.supportedDataTypes + // CHECK-NEXT: %[[TO_LAYOUT_OP:.*]] = "ttnn.to_layout"(%[[CASTING_OP]]) + // CHECK-SAME: layout = #ttnn.layout + // CHECK-NEXT: %[[TO_DEVICE_OP:.*]] = "ttnn.to_device"(%[[TO_LAYOUT_OP]], %[[GET_DEVICE_OP]]) + // CHECK-SAME: memory_config = #ttnn.memory_config<#dram, <<64x128>>, > + // CHECK-NEXT: return %[[TO_DEVICE_OP]] + %0 = "ttnn.get_device"() <{mesh_shape = #ttnn}> : () -> !tt.device<#device> + %1 = "ttnn.to_layout"(%arg0, %0) <{dtype = #tt.supportedDataTypes, layout = #ttnn.layout, memory_config = #ttnn.memory_config<#dram, <<64x128>>, >}> : (tensor<64x128xbf16, #ttnn_layout_host_tile_bf16>, !tt.device<#device>) -> tensor<64x128xf32, #ttnn_layout_device_rm> + return %1 : tensor<64x128xf32, #ttnn_layout_device_rm> + } + + // Test case when we move tensor from host to device for row-major -> tile case and cast input from bf16. 
+  func.func @from_host_to_device_data_type_from_bf16_to_f32_from_rm_to_tile(%arg0: tensor<64x128xbf16, #ttnn_layout_host_rm_bf16>) -> tensor<64x128xf32, #ttnn_layout_device_tile> {
+    // This test verifies that the `to_device`, `to_layout` and `typecast` operations are correctly inserted to change the layout from row-major to tile and cast
+    // data type from bf16 to f32 on device.
+    // CHECK: %[[GET_DEVICE_OP:.*]] = "ttnn.get_device"()
+    // CHECK-NEXT: %[[TO_DEVICE_OP:.*]] = "ttnn.to_device"(%arg0, %[[GET_DEVICE_OP]])
+    // CHECK-NEXT: %[[TO_LAYOUT_OP:.*]] = "ttnn.to_layout"(%[[TO_DEVICE_OP]], %[[GET_DEVICE_OP]])
+    // CHECK-SAME: layout = #ttnn.layout
+    // CHECK-NEXT: %[[CASTING_OP:.*]] = "ttnn.typecast"(%[[TO_LAYOUT_OP]])
+    // CHECK-SAME: dtype = #tt.supportedDataTypes
+    // CHECK-NEXT: return %[[CASTING_OP]]
+    %0 = "ttnn.get_device"() <{mesh_shape = #ttnn}> : () -> !tt.device<#device>
+    %1 = "ttnn.to_layout"(%arg0, %0) <{dtype = #tt.supportedDataTypes, layout = #ttnn.layout, memory_config = #ttnn.memory_config<#dram, <<2x4>>, >}> : (tensor<64x128xbf16, #ttnn_layout_host_rm_bf16>, !tt.device<#device>) -> tensor<64x128xf32, #ttnn_layout_device_tile>
+    return %1 : tensor<64x128xf32, #ttnn_layout_device_tile>
+  }
+
+  // Test case when we move tensor from host to device for row-major -> tile case and cast input to bf16.
+  func.func @from_host_to_device_data_type_from_f32_to_bf16_from_rm_to_tile(%arg0: tensor<64x128xf32, #ttnn_layout_host_rm>) -> tensor<64x128xbf16, #ttnn_layout_device_tile_bf16> {
+    // This test verifies that the `to_dtype`, `to_device` and `to_layout` operations are correctly inserted to cast the data type from f32 to bf16 on host and then move tensor to device and change the layout from row-major to tile.
+    // CHECK: %[[GET_DEVICE_OP:.*]] = "ttnn.get_device"()
+    // CHECK-NEXT: %[[CASTING_OP:.*]] = "ttnn.to_dtype"(%arg0)
+    // CHECK-SAME: dtype = #tt.supportedDataTypes
+    // CHECK-NEXT: %[[TO_DEVICE_OP:.*]] = "ttnn.to_device"(%[[CASTING_OP]], %[[GET_DEVICE_OP]])
+    // CHECK-NEXT: %[[TO_LAYOUT_OP:.*]] = "ttnn.to_layout"(%[[TO_DEVICE_OP]], %[[GET_DEVICE_OP]])
+    // CHECK-SAME: layout = #ttnn.layout
+    // CHECK-NEXT: return %[[TO_LAYOUT_OP]]
+    %0 = "ttnn.get_device"() <{mesh_shape = #ttnn}> : () -> !tt.device<#device>
+    %1 = "ttnn.to_layout"(%arg0, %0) <{dtype = #tt.supportedDataTypes, layout = #ttnn.layout, memory_config = #ttnn.memory_config<#dram, <<2x4>>, >}> : (tensor<64x128xf32, #ttnn_layout_host_rm>, !tt.device<#device>) -> tensor<64x128xbf16, #ttnn_layout_device_tile_bf16>
+    return %1 : tensor<64x128xbf16, #ttnn_layout_device_tile_bf16>
+  }
+
+  // Test case when we move tensor from host to device for row-major -> tile case and we don't cast data type to bf16 nor from bf16.
+  func.func @from_host_to_device_data_type_from_f32_to_u32_from_rm_to_tile(%arg0: tensor<64x128xf32, #ttnn_layout_host_rm>) -> tensor<64x128xi32, #ttnn_layout_device_tile_u32> {
+    // This test verifies that the `to_dtype`, `to_layout` and `to_device` operations are correctly inserted to cast the data type from f32 to u32 and tilize on host and then move tensor to device.
+ // CHECK: %[[GET_DEVICE_OP:.*]] = "ttnn.get_device"() + // CHECK-NEXT: %[[CASTING_OP:.*]] = "ttnn.to_dtype"(%arg0) + // CHECK-SAME: dtype = #tt.supportedDataTypes + // CHECK-NEXT: %[[TO_LAYOUT_OP:.*]] = "ttnn.to_layout"(%[[CASTING_OP]]) + // CHECK-SAME: layout = #ttnn.layout + // CHECK-NEXT: %[[TO_DEVICE_OP:.*]] = "ttnn.to_device"(%[[TO_LAYOUT_OP]], %[[GET_DEVICE_OP]]) + // CHECK-NEXT: return %[[TO_DEVICE_OP]] + %0 = "ttnn.get_device"() <{mesh_shape = #ttnn}> : () -> !tt.device<#device> + %1 = "ttnn.to_layout"(%arg0, %0) <{dtype = #tt.supportedDataTypes, layout = #ttnn.layout, memory_config = #ttnn.memory_config<#dram, <<2x4>>, >}> : (tensor<64x128xf32, #ttnn_layout_host_rm>, !tt.device<#device>) -> tensor<64x128xi32, #ttnn_layout_device_tile_u32> + return %1 : tensor<64x128xi32, #ttnn_layout_device_tile_u32> + } +} diff --git a/test/ttmlir/Silicon/TTNN/perf_unit/test_perf_clamp.mlir b/test/ttmlir/Silicon/TTNN/perf_unit/test_perf_clamp.mlir index 44806c22df..5f5239c40b 100644 --- a/test/ttmlir/Silicon/TTNN/perf_unit/test_perf_clamp.mlir +++ b/test/ttmlir/Silicon/TTNN/perf_unit/test_perf_clamp.mlir @@ -4,8 +4,9 @@ func.func @clamp(%arg0: tensor<64x128xbf16>) -> tensor<64x128xbf16> { %0 = tensor.empty() : tensor<64x128xbf16> + // CHECK: %[[GET_DEVICE:.*]] = "ttnn.get_device"() // CHECK: %[[DEVICE:.*]] = "ttnn.to_device"(%arg0, - // CHECK: %[[LAYOUT:.*]] = "ttnn.to_layout"(%[[DEVICE]]) + // CHECK: %[[LAYOUT:.*]] = "ttnn.to_layout"(%[[DEVICE]], %[[GET_DEVICE]]) // CHECK: = "ttnn.clamp"(%[[LAYOUT]]) // CHECK-SAME: {max = 3.000000e+00 : f32, min = 2.000000e+00 : f32} // CHECK-SAME: [[TENSOR:tensor<64x128xbf16]], #ttnn_layout{{[0-9]+}}>) -> [[TENSOR]] diff --git a/test/ttmlir/Silicon/TTNN/simple_eltwise.mlir b/test/ttmlir/Silicon/TTNN/simple_eltwise.mlir index a0452f01f8..52c99bae48 100644 --- a/test/ttmlir/Silicon/TTNN/simple_eltwise.mlir +++ b/test/ttmlir/Silicon/TTNN/simple_eltwise.mlir @@ -19,8 +19,9 @@ func.func @ceil(%arg0: tensor<32x32xf32>) -> tensor<32x32xf32> { func.func @clamp(%arg0: tensor<64x128xbf16>) -> tensor<64x128xbf16> { %0 = tensor.empty() : tensor<64x128xbf16> + // CHECK: %[[GET_DEVICE:.*]] = "ttnn.get_device"() // CHECK: %[[DEVICE:.*]] = "ttnn.to_device"(%arg0, - // CHECK: %[[LAYOUT:.*]] = "ttnn.to_layout"(%[[DEVICE]]) + // CHECK: %[[LAYOUT:.*]] = "ttnn.to_layout"(%[[DEVICE]], %[[GET_DEVICE]]) // CHECK: = "ttnn.clamp"(%[[LAYOUT]]) // CHECK-SAME: {max = 3.000000e+00 : f32, min = 2.000000e+00 : f32} // CHECK-SAME: [[TENSOR:tensor<64x128xbf16]], #ttnn_layout{{[0-9]+}}>) -> [[TENSOR]] diff --git a/tools/scripts/extract-and-replace-system-desc-and-device.py b/tools/scripts/extract-and-replace-system-desc-and-device.py new file mode 100644 index 0000000000..cddbd2f549 --- /dev/null +++ b/tools/scripts/extract-and-replace-system-desc-and-device.py @@ -0,0 +1,46 @@ +# SPDX-FileCopyrightText: (c) 2025 Tenstorrent AI ULC +# +# SPDX-License-Identifier: Apache-2.0 +import sys +import os + + +def main(input_file, output_file): + # Read content from the input file + with open(input_file, "r") as file: + system_desc = "" + device_desc = "" + for line in file: + if "#system_desc =" in line: + system_desc = line.strip() + if "#device =" in line: + device_desc = line.strip() + + # Write the modified content to the output file + modified_content = "" + with open(output_file, "r") as file: + for line in file: + # print(line) + if line.strip().startswith("#device ="): + modified_content += device_desc + elif line.strip().startswith("#system_desc ="): + modified_content += system_desc + elif 
line.strip().startswith("// RUN:"):
+                continue
+            else:
+                modified_content += line
+
+    print(modified_content)
+
+
+if __name__ == "__main__":
+    if len(sys.argv) != 3:
+        print(
+            "Usage: python extract-and-replace-system-desc-and-device.py <input_file> <output_file>"
+        )
+        sys.exit(1)
+
+    input_file = sys.argv[1]
+    output_file = sys.argv[2]
+
+    main(input_file, output_file)
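
The helper script above takes the freshly generated MLIR as its first argument, extracts the real #system_desc and #device aliases from it, substitutes them into the checked-in test passed as the second argument, and prints the result to stdout. A minimal standalone driver is sketched below for illustration only; the file paths are hypothetical, and the real tests rely on the lit substitutions (%t.mlir, %s, %ttmlir_scripts_root) shown in the RUN lines above.

    # Hypothetical driver mirroring the lit RUN pipeline; paths are examples, not part of this patch.
    import subprocess
    import sys

    # Equivalent of %t.mlir: output of `ttmlir-opt --ttir-load-system-desc=... --ttir-implicit-device`.
    generated_mlir = "build/test/create_system_desc_device.out.mlir"
    # Equivalent of %s: the checked-in test with placeholder #device/#system_desc aliases.
    test_mlir = "test/ttmlir/Silicon/TTNN/Transforms/DecomposeLayouts/decomposing_layouts_from_host.mlir"

    # The script writes the substituted test to stdout, so capture it and save it,
    # mimicking the `> %t_replaced.mlir` redirection in the RUN line.
    result = subprocess.run(
        [
            sys.executable,
            "tools/scripts/extract-and-replace-system-desc-and-device.py",
            generated_mlir,
            test_mlir,
        ],
        capture_output=True,
        text=True,
        check=True,
    )

    with open("build/test/decomposing_layouts_from_host.replaced.mlir", "w") as f:
        f.write(result.stdout)

The replaced file is then what `ttmlir-opt --ttnn-decompose-layouts` and FileCheck consume in the remaining RUN lines.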