#14372: fix transpose hc RM pcc errors + add N-dimensional permute single core implementation (#14388)

#14372: fix transpose hc RM pcc errors
- The restriction was on alignment, which was hard-coded to 16 B (the L1 alignment), causing PCC issues when the tensor was read from DRAM, which requires 32 B alignment (see the sketch below)
#14370: add basic N-d permute code and support both N-d transpose and permute
- TODO: make multi-core; single-core for now
sjameelTT authored Oct 31, 2024
1 parent c0d46d5 commit 89aa76d
Showing 12 changed files with 504 additions and 23 deletions.
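
Before the diffs, a minimal sketch of the alignment rule behind the transpose fix (illustrative Python, not the actual ttnn implementation; the helper and constant names are mine): a row-major stick must be aligned to 16 B in L1 but 32 B in DRAM, so a shape like [1, 8, 4096, 40] in bfloat16 (80 B sticks) passed the hard-coded 16 B check yet produced bad PCC when read from DRAM. Such cases now fall back to the tiled path, as the updated tests note.

L1_ALIGNMENT_BYTES = 16
DRAM_ALIGNMENT_BYTES = 32

def stick_is_aligned(width, element_size, alignment):
    # a row-major "stick" is one row along the innermost dimension
    return (width * element_size) % alignment == 0

# [1, 8, 4096, 40] in bfloat16: 40 * 2 = 80 bytes per stick
assert stick_is_aligned(40, 2, L1_ALIGNMENT_BYTES)        # 80 % 16 == 0, fine when resident in L1
assert not stick_is_aligned(40, 2, DRAM_ALIGNMENT_BYTES)  # 80 % 32 != 0, bad PCC from DRAM before the fix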
@@ -743,21 +743,23 @@ def test_transpose_4d_wh_tile(shape, device):
@pytest.mark.parametrize(
"config",
[
[[1, 8, 4096, 40], [1, 2], ttnn.ROW_MAJOR_LAYOUT], # bad pcc
[[1, 9, 8, 40], [1, 2], ttnn.ROW_MAJOR_LAYOUT], # bad pcc
[[64, 4, 49, 32], [-2, -1], ttnn.ROW_MAJOR_LAYOUT], # Page size must be divisible by sizeof(uint32_t)
[[1, 1370, 1, 3, 1280], [0, -2], ttnn.ROW_MAJOR_LAYOUT], # greater than 4D
[[1, 1370, 1, 3, 1280], [0, -2], ttnn.TILE_LAYOUT], # untilize doesn't work with 4D
[[12, 3], [0, 1], ttnn.ROW_MAJOR_LAYOUT], # need tensor for this one
],
)
def test_transpose_failures(config, device):
@pytest.mark.parametrize("memory_config", [ttnn.L1_MEMORY_CONFIG, ttnn.DRAM_MEMORY_CONFIG])
def test_transpose_failures(config, memory_config, device):
pytest.skip("Failures to fix after #13217 and #13005 are in - 5D, HC PCC issue and unaligned RM tensor")
torch_input = torch.randn(config[0], dtype=torch.bfloat16)
torch_output = torch_input.transpose(config[1][0], config[1][1])

tt_input = ttnn.from_torch(torch_input, dtype=ttnn.DataType.BFLOAT16, layout=config[2], device=device)
tt_input = ttnn.from_torch(
torch_input, dtype=ttnn.DataType.BFLOAT16, layout=config[2], device=device, memory_config=memory_config
)
tt_output = ttnn.transpose(tt_input, config[1][0], config[1][1])
tt_output = ttnn.to_torch(tt_output)

assert_with_pcc(torch_output, tt_output, 0.9999)


@@ -769,24 +771,76 @@ def test_transpose_failures(config, device):
[1, 16, 6, 64],
[-1, -2],
ttnn.ROW_MAJOR_LAYOUT,
], # (W * input_tensor.element_size()) % ROW_MAJOR_STICK_WIDTH == 0 && (H * input_tensor.element_size()) % ROW_MAJOR_STICK_WIDTH)
],
[
[1, 16, 64, 6],
[-1, -2],
ttnn.ROW_MAJOR_LAYOUT,
], # (W * input_tensor.element_size()) % ROW_MAJOR_STICK_WIDTH == 0 && (H * input_tensor.element_size()) % ROW_MAJOR_STICK_WIDTH)
],
[
[1, 16, 64, 6],
[1, 2],
ttnn.ROW_MAJOR_LAYOUT,
], # (W * input_tensor.element_size()) % ROW_MAJOR_STICK_WIDTH == 0 for HC as well...
],
[[1, 9, 8, 18], [1, 2], ttnn.ROW_MAJOR_LAYOUT], # unaligned RM that falls back to tiled
[[1, 9, 8, 14], [1, 2], ttnn.ROW_MAJOR_LAYOUT], # unaligned RM that falls back to tiled
[[1, 9, 8, 2], [1, 2], ttnn.ROW_MAJOR_LAYOUT], # unaligned RM that falls back to tiled
[[1, 2, 8, 2], [1, 2], ttnn.ROW_MAJOR_LAYOUT], # unaligned RM that falls back to tiled
[
[1, 8, 4096, 40],
[1, 2],
ttnn.ROW_MAJOR_LAYOUT,
], # RM that falls back to tiled only when reading from DRAM (32B alignment requirement on DRAM, 16B on L1)
[[1, 9, 8, 40], [1, 2], ttnn.ROW_MAJOR_LAYOUT], # RM that falls back to tiled only when reading from DRAM
[[1, 8, 8, 8], [1, 2], ttnn.ROW_MAJOR_LAYOUT], # RM that falls back to tiled only when reading from DRAM
],
)
def test_transpose_unaligned(config, device):
@pytest.mark.parametrize("memory_config", [ttnn.L1_MEMORY_CONFIG, ttnn.DRAM_MEMORY_CONFIG])
def test_transpose_unaligned(config, memory_config, device):
# this will convert to tiled for now
torch_input = torch.randn(config[0], dtype=torch.bfloat16)
torch_output = torch_input.transpose(config[1][0], config[1][1])
tt_input = ttnn.from_torch(torch_input, dtype=ttnn.DataType.BFLOAT16, layout=config[2], device=device)
tt_input = ttnn.from_torch(
torch_input, dtype=ttnn.DataType.BFLOAT16, layout=config[2], device=device, memory_config=memory_config
)
tt_output = ttnn.transpose(tt_input, config[1][0], config[1][1])
tt_output = ttnn.to_torch(tt_output)
assert_with_pcc(torch_output, tt_output, 0.9999)


@pytest.mark.parametrize(
"shape",
[(1, 2, 32, 100), (1, 35, 7, 7), (1, 1, 1, 1)],
)
def test_transpose_hc_padded_c(shape, device):
# this will convert to tiled for now
torch_input = torch.randn(shape, dtype=torch.bfloat16)
torch_output = torch_input.transpose(1, 2)
tt_input = ttnn.from_torch(torch_input, dtype=ttnn.DataType.BFLOAT16, layout=ttnn.TILE_LAYOUT, device=device)
tt_output = ttnn.transpose(tt_input, 1, 2)
tt_output = ttnn.to_torch(tt_output)
assert_with_pcc(torch_output, tt_output, 0.9999)


@pytest.mark.parametrize(
"shape",
[[1, 197, 1, 3, 1024], [1, 197, 1, 3, 768], [1, 50, 1, 3, 1024], [1, 50, 1, 3, 768], [1, 1370, 1, 3, 1280]],
)
@pytest.mark.parametrize(
"dims",
[
(0, -2),
],
)
@pytest.mark.parametrize(
"layout",
[ttnn.ROW_MAJOR_LAYOUT],
)
def test_transpose_5d(shape, dims, layout, device):
torch_input = torch.randn(shape, dtype=torch.bfloat16)
torch_output = torch_input.transpose(dims[0], dims[1])

tt_input = ttnn.from_torch(torch_input, dtype=ttnn.DataType.BFLOAT16, layout=layout, device=device)
tt_output = ttnn.transpose(tt_input, dims[0], dims[1])
tt_output = ttnn.to_torch(tt_output)
assert_with_pcc(torch_output, tt_output, 0.9999)
17 changes: 17 additions & 0 deletions tests/ttnn/unit_tests/operations/test_permute.py
@@ -10,6 +10,8 @@

from tests.ttnn.utils_for_testing import assert_with_pcc

torch.manual_seed(2005)


@pytest.mark.parametrize("h", [32])
@pytest.mark.parametrize("w", [64])
@@ -129,3 +131,18 @@ def test_permute_bfloat8(device):
tt_output = ttnn.permute(tt_input, (0, 2, 3, 1))
tt_output = ttnn.to_torch(tt_output)
assert_with_pcc(torch_output, tt_output, 0.9999)


@pytest.mark.parametrize(
"shape", [(8, 2, 2, 3, 4), [1, 1370, 1, 3, 1280], [1, 197, 1, 3, 1024], [1, 197, 1, 3, 768], [1, 50, 1, 3, 1024]]
)
@pytest.mark.parametrize("perm", [(0, 3, 2, 1, 4), (3, 1, 2, 0, 4), (0, 3, 2, 1, 4), (1, 3, 2, 0, 4), (0, 3, 1, 2, 4)])
def test_permute_5d(shape, perm, device):
input_a = torch.randn(shape)
torch_output = torch.permute(input_a, perm)

tt_input = ttnn.from_torch(input_a, device=device, layout=ttnn.ROW_MAJOR_LAYOUT, dtype=ttnn.bfloat16)

tt_output = ttnn.permute(tt_input, perm)
tt_output = ttnn.to_torch(tt_output)
assert_with_pcc(torch_output, tt_output, 0.9999)
2 changes: 1 addition & 1 deletion tt_metal/impl/buffers/buffer.cpp
@@ -49,7 +49,7 @@ void validate_buffer_size_and_page_size(
size);
TT_FATAL(
page_size % sizeof(uint32_t) == 0,
"Page size must be divisible by sizeof(uint32_t) because buffers hold uint32_t values");
"Page size {} must be divisible by sizeof(uint32_t) because buffers hold uint32_t values", page_size);

if (is_sharded(buffer_layout)) {
TT_FATAL(
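
A worked example of the page-size rule this assert enforces (illustrative Python; assuming a row-major page is one stick of W elements, so page_size = W * element_size, and the helper name is mine): transposing [64, 4, 49, 32] on (-2, -1) makes the innermost width 49, and with bfloat16 that is 49 * 2 = 98 bytes, which is not divisible by sizeof(uint32_t) -- the failure the transpose tests above call out.

def page_size_is_valid(width, element_size):
    page_size = width * element_size
    return page_size % 4 == 0  # sizeof(uint32_t)

assert not page_size_is_valid(49, 2)  # 98 B page -> rejected by the TT_FATAL above
assert page_size_is_valid(32, 2)      # 64 B page -> fine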
2 changes: 2 additions & 0 deletions ttnn/CMakeLists.txt
@@ -83,6 +83,8 @@ set(ALL_TTNN_SRCS
${CMAKE_CURRENT_SOURCE_DIR}/cpp/ttnn/operations/data_movement/pad/device/pad_program_factory.cpp
${CMAKE_CURRENT_SOURCE_DIR}/cpp/ttnn/operations/data_movement/pad/pad.cpp
${CMAKE_CURRENT_SOURCE_DIR}/cpp/ttnn/operations/data_movement/permute/permute.cpp
${CMAKE_CURRENT_SOURCE_DIR}/cpp/ttnn/operations/data_movement/permute/device/permute_device_operation.cpp
${CMAKE_CURRENT_SOURCE_DIR}/cpp/ttnn/operations/data_movement/permute/device/permute_program_factory.cpp
${CMAKE_CURRENT_SOURCE_DIR}/cpp/ttnn/operations/data_movement/repeat/device/repeat_op.cpp
${CMAKE_CURRENT_SOURCE_DIR}/cpp/ttnn/operations/data_movement/repeat/device/repeat_program_factory.cpp
${CMAKE_CURRENT_SOURCE_DIR}/cpp/ttnn/operations/data_movement/repeat/repeat.cpp
@@ -0,0 +1,31 @@
// SPDX-FileCopyrightText: © 2024 Tenstorrent Inc.
//
// SPDX-License-Identifier: Apache-2.0

#include <stdint.h>
#include "dataflow_api.h"

void kernel_main() {
constexpr bool src0_is_dram = (bool) get_compile_time_arg_val(0);
constexpr uint32_t N = get_compile_time_arg_val(1);
constexpr uint32_t page_size = get_compile_time_arg_val(2);
constexpr uint32_t num_rows = get_compile_time_arg_val(3);

const uint32_t src_addr = get_arg_val<uint32_t>(0);

const InterleavedAddrGen<src0_is_dram> s0 = {
.bank_base_address = src_addr,
.page_size = page_size
};

uint32_t curr_addr = src_addr;
for (uint32_t i = 0; i < num_rows; ++i) {
cb_reserve_back(tt::CB::c_in0, 1);
uint32_t src_buffer_l1_addr = get_write_ptr(tt::CB::c_in0);
noc_async_read_page(i, s0, src_buffer_l1_addr);
noc_async_read_barrier();
volatile tt_l1_ptr uint16_t* out_stick = reinterpret_cast<volatile tt_l1_ptr uint16_t*>(src_buffer_l1_addr);
cb_push_back(tt::CB::c_in0, 1);
}

}
@@ -0,0 +1,61 @@
// SPDX-FileCopyrightText: © 2024 Tenstorrent Inc.
//
// SPDX-License-Identifier: Apache-2.0

#include <stdint.h>
#include "dataflow_api.h"


void kernel_main() {
constexpr bool dst_is_dram = (bool) get_compile_time_arg_val(0);
constexpr uint32_t N = get_compile_time_arg_val(1);
constexpr uint32_t page_size = get_compile_time_arg_val(2);
constexpr uint32_t num_rows = get_compile_time_arg_val(3);

const uint32_t dst_addr = get_arg_val<uint32_t>(0);

const InterleavedAddrGen<dst_is_dram> s0 = {
.bank_base_address = dst_addr,
.page_size = page_size
};

uint32_t input_shape[N], perm[N], dest_strides[N];
for (uint32_t i = 1; i <= N; i++) {
input_shape[i - 1] = get_arg_val<uint32_t>(i);
perm[i - 1] = get_arg_val<uint32_t>(i + N);
dest_strides[i - 1] = get_arg_val<uint32_t>(i + 2*N);
}

uint32_t src_buffer_l1_addr = get_write_ptr(tt::CB::c_in0);
uint32_t curr_addr = dst_addr;
for (uint32_t row = 0; row < num_rows; ++row) {
// Compute multi-dimensional index for the source row
uint32_t src_multi_idx[N];
size_t remaining = row;
for(uint32_t i = 0; i < N - 1; ++i) {
size_t dim = N - 2 - i; // Start from the second last dimension
src_multi_idx[dim] = remaining % input_shape[dim];
remaining /= input_shape[dim];
}
src_multi_idx[N - 1] = 0; // Row dimension index

// Apply permutation to get destination multi-dimensional index
uint32_t dest_multi_idx[N];
for(uint32_t i = 0; i < N; ++i) {
dest_multi_idx[i] = src_multi_idx[perm[i]];
}

// Convert destination multi-dimensional index to linear index
uint32_t dest_linear_idx = 0;
for(uint32_t i = 0; i < N - 1; ++i) {
dest_linear_idx += dest_multi_idx[i] * dest_strides[i];
}
cb_wait_front(tt::CB::c_in0, 1);
uint32_t l1_read_addr = get_read_ptr(tt::CB::c_in0);
uint64_t dst_noc_addr = get_noc_addr(dest_linear_idx, s0);
noc_async_write(l1_read_addr, dst_noc_addr, page_size);
noc_async_write_barrier();
cb_pop_front(tt::CB::c_in0, 1);
}

}
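
A host-side restatement of the writer kernel's index math (illustrative Python, not the device code; it assumes dest_strides are the row strides of the permuted output shape, which the program factory builds but this diff does not show, and both helper names are mine):

def destination_row(row, input_shape, perm, dest_strides):
    N = len(input_shape)
    # decompose the linear row index into a multi-index over every dim except the last
    src_multi_idx = [0] * N
    remaining = row
    for i in range(N - 1):
        dim = N - 2 - i  # start from the second-to-last dimension
        src_multi_idx[dim] = remaining % input_shape[dim]
        remaining //= input_shape[dim]
    # apply the permutation, then flatten with the destination row strides
    dest_multi_idx = [src_multi_idx[perm[i]] for i in range(N)]
    return sum(dest_multi_idx[i] * dest_strides[i] for i in range(N - 1))

def row_strides(output_shape):
    # row strides over the first N-1 dims; the last dim is the page and keeps stride 1
    N = len(output_shape)
    strides = [1] * N
    for i in range(N - 3, -1, -1):
        strides[i] = strides[i + 1] * output_shape[i + 1]
    return strides

# e.g. input shape (2, 3, 4, 8) with perm (0, 2, 1, 3): output shape is (2, 4, 3, 8);
# input row 5 is the stick at [0, 1, 1, :], which lands at output row 4
assert destination_row(5, [2, 3, 4, 8], [0, 2, 1, 3], row_strides([2, 4, 3, 8])) == 4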
@@ -0,0 +1,68 @@
// SPDX-FileCopyrightText: © 2024 Tenstorrent Inc.
//
// SPDX-License-Identifier: Apache-2.0

#include <cstdint>

#include "ttnn/cpp/ttnn/tensor/types.hpp"
#include "ttnn/cpp/ttnn/operations/data_movement/permute/device/permute_device_operation.hpp"

namespace ttnn::operations::data_movement {

PermuteDeviceOperation::program_factory_t PermuteDeviceOperation::select_program_factory(
const operation_attributes_t& operation_attributes, const tensor_args_t& tensor_args) {
return SingleCore{};
}

void PermuteDeviceOperation::validate_on_program_cache_miss(
const operation_attributes_t& attributes, const tensor_args_t& tensor_args) {
TT_FATAL(attributes.dims.size() == tensor_args.input_tensor.get_logical_shape().rank(),
"Permute dimensions must match input tensor rank");
TT_FATAL(attributes.dims.back() == tensor_args.input_tensor.get_logical_shape().rank() - 1,
"Last dimension of permute must be the last dimension of the input tensor as page-breaking is not supported at the moment");
TT_FATAL(tensor_args.input_tensor.is_sharded() == false,
"Permute operation does not support sharded input tensor");
TT_FATAL(tensor_args.input_tensor.get_layout() == Layout::ROW_MAJOR, "Permute operation only supports row-major layout");
}

void PermuteDeviceOperation::validate_on_program_cache_hit(
const operation_attributes_t& attributes, const tensor_args_t& tensor_args) {}

PermuteDeviceOperation::shape_return_value_t PermuteDeviceOperation::compute_output_shapes(
const operation_attributes_t& attributes, const tensor_args_t& tensor_args) {
SmallVector<uint32_t> shape, padded_shape;
auto input_shape = tensor_args.input_tensor.get_logical_shape();
shape.reserve(input_shape.rank());
for (auto dim : attributes.dims) {
shape.push_back(input_shape[dim]);
}
return ttnn::SimpleShape(shape);
}

PermuteDeviceOperation::tensor_return_value_t PermuteDeviceOperation::create_output_tensors(
const operation_attributes_t& operation_attributes, const tensor_args_t& tensor_args) {
if (tensor_args.optional_output_tensor.has_value()) {
return tensor_args.optional_output_tensor.value();
}
auto output_shape = compute_output_shapes(operation_attributes, tensor_args);
const auto& input_tensor = tensor_args.input_tensor;
return create_device_tensor(
output_shape,
input_tensor.tensor_attributes->dtype,
input_tensor.tensor_attributes->layout,
input_tensor.device());
}


std::tuple<PermuteDeviceOperation::operation_attributes_t, PermuteDeviceOperation::tensor_args_t>
PermuteDeviceOperation::invoke(const Tensor& input_tensor, const SmallVector<uint32_t>& dims,
const std::optional<MemoryConfig>& memory_config, std::optional<Tensor> optional_output_tensor) {
return {
operation_attributes_t{.dims=dims,
.output_mem_config=memory_config.value_or(input_tensor.memory_config())},
tensor_args_t{.input_tensor=input_tensor, .optional_output_tensor=optional_output_tensor}
};
}


} // namespace ttnn::operations::data_movement
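
A compact restatement of the output-shape rule and the main restrictions validated above (illustrative Python, not the C++; the function name is mine, and the input must also be row-major and unsharded per the TT_FATALs):

def permute_output_shape(input_shape, dims):
    assert len(dims) == len(input_shape)       # dims must match the input rank
    assert dims[-1] == len(input_shape) - 1    # last dim must stay last (no page-breaking yet)
    return [input_shape[d] for d in dims]      # output dim i takes its size from input dim dims[i]

# e.g. permute_output_shape([1, 1370, 1, 3, 1280], [0, 3, 2, 1, 4]) == [1, 3, 1, 1370, 1280]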