#14372: fix transpose hc RM pcc errors + add N-dimensional permute single core implementation (#14388)

#14372: fix transpose hc RM pcc errors
- The restriction was on alignment, which was hard-coded to 16 B (the L1 alignment), causing PCC issues when the tensor was read from DRAM, which requires 32 B alignment (see the sketch below)
#14370: add basic N-d permute code and support both N-d transpose and permute
- TODO: make multi-core; single-core for now
sjameelTT authored Oct 31, 2024
1 parent c0d46d5 commit 89aa76d
Showing 12 changed files with 504 additions and 23 deletions.
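
Before the diffs, a minimal sketch of the alignment rule behind the transpose fix (illustrative Python, not the actual ttnn implementation; the helper and constant names are mine): a row-major stick must be aligned to 16 B in L1 but 32 B in DRAM, so a shape like [1, 8, 4096, 40] in bfloat16 (80 B sticks) passed the hard-coded 16 B check yet produced bad PCC when read from DRAM. Such cases now fall back to the tiled path, as the updated tests note.

L1_ALIGNMENT_BYTES = 16
DRAM_ALIGNMENT_BYTES = 32

def stick_is_aligned(width, element_size, alignment):
    # a row-major "stick" is one row along the innermost dimension
    return (width * element_size) % alignment == 0

# [1, 8, 4096, 40] in bfloat16: 40 * 2 = 80 bytes per stick
assert stick_is_aligned(40, 2, L1_ALIGNMENT_BYTES)        # 80 % 16 == 0, fine when resident in L1
assert not stick_is_aligned(40, 2, DRAM_ALIGNMENT_BYTES)  # 80 % 32 != 0, bad PCC from DRAM before the fix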
@@ -743,21 +743,23 @@ def test_transpose_4d_wh_tile(shape, device):
@pytest.mark.parametrize(
"config",
[
[[1, 8, 4096, 40], [1, 2], ttnn.ROW_MAJOR_LAYOUT], # bad pcc
[[1, 9, 8, 40], [1, 2], ttnn.ROW_MAJOR_LAYOUT], # bad pcc
[[64, 4, 49, 32], [-2, -1], ttnn.ROW_MAJOR_LAYOUT], # Page size must be divisible by sizeof(uint32_t)
[[1, 1370, 1, 3, 1280], [0, -2], ttnn.ROW_MAJOR_LAYOUT], # greater than 4D
[[1, 1370, 1, 3, 1280], [0, -2], ttnn.TILE_LAYOUT], # untilize doesn't work with 4D
[[12, 3], [0, 1], ttnn.ROW_MAJOR_LAYOUT], # need tensor for this one
],
)
def test_transpose_failures(config, device):
@pytest.mark.parametrize("memory_config", [ttnn.L1_MEMORY_CONFIG, ttnn.DRAM_MEMORY_CONFIG])
def test_transpose_failures(config, memory_config, device):
pytest.skip("Failures to fix after #13217 and #13005 are in - 5D, HC PCC issue and unaligned RM tensor")
torch_input = torch.randn(config[0], dtype=torch.bfloat16)
torch_output = torch_input.transpose(config[1][0], config[1][1])

tt_input = ttnn.from_torch(torch_input, dtype=ttnn.DataType.BFLOAT16, layout=config[2], device=device)
tt_input = ttnn.from_torch(
torch_input, dtype=ttnn.DataType.BFLOAT16, layout=config[2], device=device, memory_config=memory_config
)
tt_output = ttnn.transpose(tt_input, config[1][0], config[1][1])
tt_output = ttnn.to_torch(tt_output)

assert_with_pcc(torch_output, tt_output, 0.9999)


@@ -769,24 +771,76 @@ def test_transpose_failures(config, device):
[1, 16, 6, 64],
[-1, -2],
ttnn.ROW_MAJOR_LAYOUT,
], # (W * input_tensor.element_size()) % ROW_MAJOR_STICK_WIDTH == 0 && (H * input_tensor.element_size()) % ROW_MAJOR_STICK_WIDTH)
],
[
[1, 16, 64, 6],
[-1, -2],
ttnn.ROW_MAJOR_LAYOUT,
], # (W * input_tensor.element_size()) % ROW_MAJOR_STICK_WIDTH == 0 && (H * input_tensor.element_size()) % ROW_MAJOR_STICK_WIDTH)
],
[
[1, 16, 64, 6],
[1, 2],
ttnn.ROW_MAJOR_LAYOUT,
], # (W * input_tensor.element_size()) % ROW_MAJOR_STICK_WIDTH == 0 for HC as well...
],
[[1, 9, 8, 18], [1, 2], ttnn.ROW_MAJOR_LAYOUT], # unaligned RM that falls back to tiled
[[1, 9, 8, 14], [1, 2], ttnn.ROW_MAJOR_LAYOUT], # unaligned RM that falls back to tiled
[[1, 9, 8, 2], [1, 2], ttnn.ROW_MAJOR_LAYOUT], # unaligned RM that falls back to tiled
[[1, 2, 8, 2], [1, 2], ttnn.ROW_MAJOR_LAYOUT], # unaligned RM that falls back to tiled
[
[1, 8, 4096, 40],
[1, 2],
ttnn.ROW_MAJOR_LAYOUT,
], # RM that falls back to tiled only when reading from DRAM (32B alignment requirement on DRAM, 16B on L1)
[[1, 9, 8, 40], [1, 2], ttnn.ROW_MAJOR_LAYOUT], # RM that falls back to tiled only when reading from DRAM
[[1, 8, 8, 8], [1, 2], ttnn.ROW_MAJOR_LAYOUT], # RM that falls back to tiled only when reading from DRAM
],
)
def test_transpose_unaligned(config, device):
@pytest.mark.parametrize("memory_config", [ttnn.L1_MEMORY_CONFIG, ttnn.DRAM_MEMORY_CONFIG])
def test_transpose_unaligned(config, memory_config, device):
# this will convert to tiled for now
torch_input = torch.randn(config[0], dtype=torch.bfloat16)
torch_output = torch_input.transpose(config[1][0], config[1][1])
tt_input = ttnn.from_torch(torch_input, dtype=ttnn.DataType.BFLOAT16, layout=config[2], device=device)
tt_input = ttnn.from_torch(
torch_input, dtype=ttnn.DataType.BFLOAT16, layout=config[2], device=device, memory_config=memory_config
)
tt_output = ttnn.transpose(tt_input, config[1][0], config[1][1])
tt_output = ttnn.to_torch(tt_output)
assert_with_pcc(torch_output, tt_output, 0.9999)


@pytest.mark.parametrize(
"shape",
[(1, 2, 32, 100), (1, 35, 7, 7), (1, 1, 1, 1)],
)
def test_transpose_hc_padded_c(shape, device):
# this will convert to tiled for now
torch_input = torch.randn(shape, dtype=torch.bfloat16)
torch_output = torch_input.transpose(1, 2)
tt_input = ttnn.from_torch(torch_input, dtype=ttnn.DataType.BFLOAT16, layout=ttnn.TILE_LAYOUT, device=device)
tt_output = ttnn.transpose(tt_input, 1, 2)
tt_output = ttnn.to_torch(tt_output)
assert_with_pcc(torch_output, tt_output, 0.9999)


@pytest.mark.parametrize(
"shape",
[[1, 197, 1, 3, 1024], [1, 197, 1, 3, 768], [1, 50, 1, 3, 1024], [1, 50, 1, 3, 768], [1, 1370, 1, 3, 1280]],
)
@pytest.mark.parametrize(
"dims",
[
(0, -2),
],
)
@pytest.mark.parametrize(
"layout",
[ttnn.ROW_MAJOR_LAYOUT],
)
def test_transpose_5d(shape, dims, layout, device):
torch_input = torch.randn(shape, dtype=torch.bfloat16)
torch_output = torch_input.transpose(dims[0], dims[1])

tt_input = ttnn.from_torch(torch_input, dtype=ttnn.DataType.BFLOAT16, layout=layout, device=device)
tt_output = ttnn.transpose(tt_input, dims[0], dims[1])
tt_output = ttnn.to_torch(tt_output)
assert_with_pcc(torch_output, tt_output, 0.9999)
17 changes: 17 additions & 0 deletions tests/ttnn/unit_tests/operations/test_permute.py
@@ -10,6 +10,8 @@

from tests.ttnn.utils_for_testing import assert_with_pcc

torch.manual_seed(2005)


@pytest.mark.parametrize("h", [32])
@pytest.mark.parametrize("w", [64])
@@ -129,3 +131,18 @@ def test_permute_bfloat8(device):
tt_output = ttnn.permute(tt_input, (0, 2, 3, 1))
tt_output = ttnn.to_torch(tt_output)
assert_with_pcc(torch_output, tt_output, 0.9999)


@pytest.mark.parametrize(
"shape", [(8, 2, 2, 3, 4), [1, 1370, 1, 3, 1280], [1, 197, 1, 3, 1024], [1, 197, 1, 3, 768], [1, 50, 1, 3, 1024]]
)
@pytest.mark.parametrize("perm", [(0, 3, 2, 1, 4), (3, 1, 2, 0, 4), (0, 3, 2, 1, 4), (1, 3, 2, 0, 4), (0, 3, 1, 2, 4)])
def test_permute_5d(shape, perm, device):
input_a = torch.randn(shape)
torch_output = torch.permute(input_a, perm)

tt_input = ttnn.from_torch(input_a, device=device, layout=ttnn.ROW_MAJOR_LAYOUT, dtype=ttnn.bfloat16)

tt_output = ttnn.permute(tt_input, perm)
tt_output = ttnn.to_torch(tt_output)
assert_with_pcc(torch_output, tt_output, 0.9999)
2 changes: 1 addition & 1 deletion tt_metal/impl/buffers/buffer.cpp
@@ -49,7 +49,7 @@ void validate_buffer_size_and_page_size(
size);
TT_FATAL(
page_size % sizeof(uint32_t) == 0,
"Page size must be divisible by sizeof(uint32_t) because buffers hold uint32_t values");
"Page size {} must be divisible by sizeof(uint32_t) because buffers hold uint32_t values", page_size);

if (is_sharded(buffer_layout)) {
TT_FATAL(
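
A worked example of the page-size rule this assert enforces (illustrative Python; assuming a row-major page is one stick of W elements, so page_size = W * element_size, and the helper name is mine): transposing [64, 4, 49, 32] on (-2, -1) makes the innermost width 49, and with bfloat16 that is 49 * 2 = 98 bytes, which is not divisible by sizeof(uint32_t) -- the failure the transpose tests above call out.

def page_size_is_valid(width, element_size):
    page_size = width * element_size
    return page_size % 4 == 0  # sizeof(uint32_t)

assert not page_size_is_valid(49, 2)  # 98 B page -> rejected by the TT_FATAL above
assert page_size_is_valid(32, 2)      # 64 B page -> fine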
2 changes: 2 additions & 0 deletions ttnn/CMakeLists.txt
@@ -83,6 +83,8 @@ set(ALL_TTNN_SRCS
${CMAKE_CURRENT_SOURCE_DIR}/cpp/ttnn/operations/data_movement/pad/device/pad_program_factory.cpp
${CMAKE_CURRENT_SOURCE_DIR}/cpp/ttnn/operations/data_movement/pad/pad.cpp
${CMAKE_CURRENT_SOURCE_DIR}/cpp/ttnn/operations/data_movement/permute/permute.cpp
${CMAKE_CURRENT_SOURCE_DIR}/cpp/ttnn/operations/data_movement/permute/device/permute_device_operation.cpp
${CMAKE_CURRENT_SOURCE_DIR}/cpp/ttnn/operations/data_movement/permute/device/permute_program_factory.cpp
${CMAKE_CURRENT_SOURCE_DIR}/cpp/ttnn/operations/data_movement/repeat/device/repeat_op.cpp
${CMAKE_CURRENT_SOURCE_DIR}/cpp/ttnn/operations/data_movement/repeat/device/repeat_program_factory.cpp
${CMAKE_CURRENT_SOURCE_DIR}/cpp/ttnn/operations/data_movement/repeat/repeat.cpp
@@ -0,0 +1,31 @@
// SPDX-FileCopyrightText: © 2024 Tenstorrent Inc.
//
// SPDX-License-Identifier: Apache-2.0

#include <stdint.h>
#include "dataflow_api.h"

void kernel_main() {
constexpr bool src0_is_dram = (bool) get_compile_time_arg_val(0);
constexpr uint32_t N = get_compile_time_arg_val(1);
constexpr uint32_t page_size = get_compile_time_arg_val(2);
constexpr uint32_t num_rows = get_compile_time_arg_val(3);

const uint32_t src_addr = get_arg_val<uint32_t>(0);

const InterleavedAddrGen<src0_is_dram> s0 = {
.bank_base_address = src_addr,
.page_size = page_size
};

uint32_t curr_addr = src_addr;
for (uint32_t i = 0; i < num_rows; ++i) {
cb_reserve_back(tt::CB::c_in0, 1);
uint32_t src_buffer_l1_addr = get_write_ptr(tt::CB::c_in0);
noc_async_read_page(i, s0, src_buffer_l1_addr);
noc_async_read_barrier();
volatile tt_l1_ptr uint16_t* out_stick = reinterpret_cast<volatile tt_l1_ptr uint16_t*>(src_buffer_l1_addr);
cb_push_back(tt::CB::c_in0, 1);
}

}
@@ -0,0 +1,61 @@
// SPDX-FileCopyrightText: © 2024 Tenstorrent Inc.
//
// SPDX-License-Identifier: Apache-2.0

#include <stdint.h>
#include "dataflow_api.h"


void kernel_main() {
constexpr bool dst_is_dram = (bool) get_compile_time_arg_val(0);
constexpr uint32_t N = get_compile_time_arg_val(1);
constexpr uint32_t page_size = get_compile_time_arg_val(2);
constexpr uint32_t num_rows = get_compile_time_arg_val(3);

const uint32_t dst_addr = get_arg_val<uint32_t>(0);

const InterleavedAddrGen<dst_is_dram> s0 = {
.bank_base_address = dst_addr,
.page_size = page_size
};

uint32_t input_shape[N], perm[N], dest_strides[N];
for (uint32_t i = 1; i <= N; i++) {
input_shape[i - 1] = get_arg_val<uint32_t>(i);
perm[i - 1] = get_arg_val<uint32_t>(i + N);
dest_strides[i - 1] = get_arg_val<uint32_t>(i + 2*N);
}

uint32_t src_buffer_l1_addr = get_write_ptr(tt::CB::c_in0);
uint32_t curr_addr = dst_addr;
for (uint32_t row = 0; row < num_rows; ++row) {
// Compute multi-dimensional index for the source row
uint32_t src_multi_idx[N];
size_t remaining = row;
for(uint32_t i = 0; i < N - 1; ++i) {
size_t dim = N - 2 - i; // Start from the second last dimension
src_multi_idx[dim] = remaining % input_shape[dim];
remaining /= input_shape[dim];
}
src_multi_idx[N - 1] = 0; // Row dimension index

// Apply permutation to get destination multi-dimensional index
uint32_t dest_multi_idx[N];
for(uint32_t i = 0; i < N; ++i) {
dest_multi_idx[i] = src_multi_idx[perm[i]];
}

// Convert destination multi-dimensional index to linear index
uint32_t dest_linear_idx = 0;
for(uint32_t i = 0; i < N - 1; ++i) {
dest_linear_idx += dest_multi_idx[i] * dest_strides[i];
}
cb_wait_front(tt::CB::c_in0, 1);
uint32_t l1_read_addr = get_read_ptr(tt::CB::c_in0);
uint64_t dst_noc_addr = get_noc_addr(dest_linear_idx, s0);
noc_async_write(l1_read_addr, dst_noc_addr, page_size);
noc_async_write_barrier();
cb_pop_front(tt::CB::c_in0, 1);
}

}
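
A host-side restatement of the writer kernel's index math (illustrative Python, not the device code; it assumes dest_strides are the row strides of the permuted output shape, which the program factory builds but this diff does not show, and both helper names are mine):

def destination_row(row, input_shape, perm, dest_strides):
    N = len(input_shape)
    # decompose the linear row index into a multi-index over every dim except the last
    src_multi_idx = [0] * N
    remaining = row
    for i in range(N - 1):
        dim = N - 2 - i  # start from the second-to-last dimension
        src_multi_idx[dim] = remaining % input_shape[dim]
        remaining //= input_shape[dim]
    # apply the permutation, then flatten with the destination row strides
    dest_multi_idx = [src_multi_idx[perm[i]] for i in range(N)]
    return sum(dest_multi_idx[i] * dest_strides[i] for i in range(N - 1))

def row_strides(output_shape):
    # row strides over the first N-1 dims; the last dim is the page and keeps stride 1
    N = len(output_shape)
    strides = [1] * N
    for i in range(N - 3, -1, -1):
        strides[i] = strides[i + 1] * output_shape[i + 1]
    return strides

# e.g. input shape (2, 3, 4, 8) with perm (0, 2, 1, 3): output shape is (2, 4, 3, 8);
# input row 5 is the stick at [0, 1, 1, :], which lands at output row 4
assert destination_row(5, [2, 3, 4, 8], [0, 2, 1, 3], row_strides([2, 4, 3, 8])) == 4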
@@ -0,0 +1,68 @@
// SPDX-FileCopyrightText: © 2024 Tenstorrent Inc.
//
// SPDX-License-Identifier: Apache-2.0

#include <cstdint>

#include "ttnn/cpp/ttnn/tensor/types.hpp"
#include "ttnn/cpp/ttnn/operations/data_movement/permute/device/permute_device_operation.hpp"

namespace ttnn::operations::data_movement {

PermuteDeviceOperation::program_factory_t PermuteDeviceOperation::select_program_factory(
const operation_attributes_t& operation_attributes, const tensor_args_t& tensor_args) {
return SingleCore{};
}

void PermuteDeviceOperation::validate_on_program_cache_miss(
const operation_attributes_t& attributes, const tensor_args_t& tensor_args) {
TT_FATAL(attributes.dims.size() == tensor_args.input_tensor.get_logical_shape().rank(),
"Permute dimensions must match input tensor rank");
TT_FATAL(attributes.dims.back() == tensor_args.input_tensor.get_logical_shape().rank() - 1,
"Last dimension of permute must be the last dimension of the input tensor as page-breaking is not supported at the moment");
TT_FATAL(tensor_args.input_tensor.is_sharded() == false,
"Permute operation does not support sharded input tensor");
TT_FATAL(tensor_args.input_tensor.get_layout() == Layout::ROW_MAJOR, "Permute operation only supports row-major layout");
}

void PermuteDeviceOperation::validate_on_program_cache_hit(
const operation_attributes_t& attributes, const tensor_args_t& tensor_args) {}

PermuteDeviceOperation::shape_return_value_t PermuteDeviceOperation::compute_output_shapes(
const operation_attributes_t& attributes, const tensor_args_t& tensor_args) {
SmallVector<uint32_t> shape, padded_shape;
auto input_shape = tensor_args.input_tensor.get_logical_shape();
shape.reserve(input_shape.rank());
for (auto dim : attributes.dims) {
shape.push_back(input_shape[dim]);
}
return ttnn::SimpleShape(shape);
}

PermuteDeviceOperation::tensor_return_value_t PermuteDeviceOperation::create_output_tensors(
const operation_attributes_t& operation_attributes, const tensor_args_t& tensor_args) {
if (tensor_args.optional_output_tensor.has_value()) {
return tensor_args.optional_output_tensor.value();
}
auto output_shape = compute_output_shapes(operation_attributes, tensor_args);
const auto& input_tensor = tensor_args.input_tensor;
return create_device_tensor(
output_shape,
input_tensor.tensor_attributes->dtype,
input_tensor.tensor_attributes->layout,
input_tensor.device());
}


std::tuple<PermuteDeviceOperation::operation_attributes_t, PermuteDeviceOperation::tensor_args_t>
PermuteDeviceOperation::invoke(const Tensor& input_tensor, const SmallVector<uint32_t>& dims,
const std::optional<MemoryConfig>& memory_config, std::optional<Tensor> optional_output_tensor) {
return {
operation_attributes_t{.dims=dims,
.output_mem_config=memory_config.value_or(input_tensor.memory_config())},
tensor_args_t{.input_tensor=input_tensor, .optional_output_tensor=optional_output_tensor}
};
}


} // namespace ttnn::operations::data_movement
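
A compact restatement of the output-shape rule and the main restrictions validated above (illustrative Python, not the C++; the function name is mine, and the input must also be row-major and unsharded per the TT_FATALs):

def permute_output_shape(input_shape, dims):
    assert len(dims) == len(input_shape)       # dims must match the input rank
    assert dims[-1] == len(input_shape) - 1    # last dim must stay last (no page-breaking yet)
    return [input_shape[d] for d in dims]      # output dim i takes its size from input dim dims[i]

# e.g. permute_output_shape([1, 1370, 1, 3, 1280], [0, 3, 2, 1, 4]) == [1, 3, 1, 1370, 1280]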