Skip to content

Commit

Permalink
#16758: Optimize usage and implementation of encode/decode tensor data
Browse files Browse the repository at this point in the history
- Switch to rvalue inputs for encode/decode and move vectors when possible
  * Switch to rvalue reference for passing buffer in create_row_major_owned_buffer
- Skip unnecessary conversion steps in encode/decode tensor data when possible
  * If no conversions are needed, only minor overhead for moving vectors (no copies)
- Add support for vectors in convert_layout_row_major_to_tile and convert_layout_tile_to_row_major
  * Switch to variadic args in template for BufferType
  * Switch to directly use vectors instead of creating an owned buffer when calling
- Cache logical_2d_shape calculation in tensor spec
  * Move get_2d_shape to tensor layout
- Add unnamed namespace around CMAKE_UNIQUE_NAMESPACE in tensor_impl
  • Loading branch information
TT-BrianLiu committed Jan 16, 2025
1 parent be5c2c6 commit db1d0cb
Show file tree
Hide file tree
Showing 10 changed files with 111 additions and 97 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -106,19 +106,19 @@ TEST_P(ShardWithAlignmentTests, LogicalToPhysical) {
auto physical_shape = tensor_spec.physical_shape();
ASSERT_EQ(physical_shape, params.expected.physical_shape);

const auto& logical_data = params.inputs.logical_data;
auto logical_data = params.inputs.logical_data;
const auto& expected_physical_data = params.expected.physical_data;

// Convert output physical data to row major (if necessary) for testing
auto physical_data = tensor_impl::encode_tensor_data(logical_data, tensor_spec);
auto physical_data = tensor_impl::encode_tensor_data(std::move(logical_data), tensor_spec);
if (tensor_spec.layout() == Layout::TILE) {
// TODO: Fix convert_layout_tile_to_row_major to take in vector instead of buffer?
physical_data = tensor_impl::convert_layout_tile_to_row_major(
physical_shape, tensor_spec.tile(), owned_buffer::create(std::move(physical_data)));
}

// auto shape_2D = tt::tt_metal::get_2d_shape(tensor_spec.logical_shape());
// pretty_print_data_as_shards(logical_data, shape_2D, logical_shard_shape);
// auto shape_2d = tensor_spec.logical_2d_shape();
// pretty_print_data_as_shards(params.inputs.logical_data, shape_2d, logical_shard_shape);
// pretty_print_data_as_shards(physical_data, physical_shape, physical_shard_shape);

ASSERT_EQ(physical_data.size(), expected_physical_data.size());
Expand Down Expand Up @@ -164,11 +164,11 @@ TEST_P(ShardWithAlignmentTests, PhysicalToLogical) {
physical_data = tensor_impl::convert_layout_row_major_to_tile(
physical_shape, tensor_spec.tile(), owned_buffer::create(std::move(physical_data)));
}
auto logical_data = tensor_impl::decode_tensor_data(physical_data, tensor_spec);
auto logical_data = tensor_impl::decode_tensor_data(std::move(physical_data), tensor_spec);

// auto shape_2D = tt::tt_metal::get_2d_shape(tensor_spec.logical_shape());
// pretty_print_data_as_shards(physical_data, physical_shape, physical_shard_shape);
// pretty_print_data_as_shards(logical_data, shape_2D, logical_shard_shape);
// auto shape_2d = tensor_spec.logical_2d_shape();
// pretty_print_data_as_shards(params.expected.physical_data, physical_shape, physical_shard_shape);
// pretty_print_data_as_shards(logical_data, shape_2d, logical_shard_shape);

ASSERT_EQ(logical_data.size(), expected_data.size());
for (size_t i = 0; i < logical_data.size(); i++) {
Expand Down
23 changes: 11 additions & 12 deletions ttnn/cpp/pybind11/pytensor.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -75,7 +75,7 @@ Tensor create_owned_tensor(T* data_ptr, const ttnn::TensorSpec& tensor_spec) {
auto logical_data = std::vector<T>(data_ptr, data_ptr + num_elements);

// See implementation for documentation
auto physical_data = tensor_impl::encode_tensor_data(logical_data, tensor_spec);
auto physical_data = tensor_impl::encode_tensor_data(std::move(logical_data), tensor_spec);

auto buffer = owned_buffer::create(std::move(physical_data));
auto storage = OwnedStorage{std::move(buffer)};
Expand All @@ -91,8 +91,7 @@ Tensor create_tt_tensor_from_py_data(
const std::function<void()>& on_destruction_callback) {
auto layout = tensor_spec.layout();

const bool requires_padding = tensor_spec.logical_shape().volume() !=
tensor_spec.physical_shape().height() * tensor_spec.physical_shape().width();
const bool requires_padding = tensor_spec.logical_2d_shape() != tensor_spec.physical_shape();
const bool requires_tilization = layout != Layout::ROW_MAJOR;
const bool enable_borrow = !requires_padding and !requires_tilization and !force_disable_borrow;

Expand Down Expand Up @@ -415,7 +414,7 @@ Tensor convert_python_tensors_to_tt_tensors(

template <typename T>
owned_buffer::Buffer<T> create_row_major_owned_buffer(
owned_buffer::Buffer<T> owned_buffer, const ttnn::TensorSpec& tensor_spec, const bool legacy_output) {
owned_buffer::Buffer<T>&& owned_buffer, const ttnn::TensorSpec& tensor_spec, const bool legacy_output) {
TT_FATAL(
!tensor_spec.memory_config().is_sharded() or tensor_spec.memory_config().shard_spec.has_value(),
"Sharded tensors must have a shard spec when converting to tt tensors!");
Expand All @@ -432,7 +431,7 @@ owned_buffer::Buffer<T> create_row_major_owned_buffer(
auto physical_data = owned_buffer.get();

// See implementation for documentation
auto logical_data = tensor_impl::decode_tensor_data(physical_data, tensor_spec);
auto logical_data = tensor_impl::decode_tensor_data(std::move(physical_data), tensor_spec);

return owned_buffer::create(std::move(logical_data));
}
Expand All @@ -450,27 +449,27 @@ std::variant<OwnedBuffer, BorrowedBuffer> get_host_buffer_from_tensor(
switch (tt_dtype) {
case DataType::UINT8: {
return create_row_major_owned_buffer(
owned_buffer::get_as<uint8_t>(storage.buffer), tensor_spec, legacy_output);
std::move(owned_buffer::get_as<uint8_t>(storage.buffer)), tensor_spec, legacy_output);
}
case DataType::UINT16: {
return create_row_major_owned_buffer(
owned_buffer::get_as<uint16_t>(storage.buffer), tensor_spec, legacy_output);
std::move(owned_buffer::get_as<uint16_t>(storage.buffer)), tensor_spec, legacy_output);
}
case DataType::INT32: {
return create_row_major_owned_buffer(
owned_buffer::get_as<int32_t>(storage.buffer), tensor_spec, legacy_output);
std::move(owned_buffer::get_as<int32_t>(storage.buffer)), tensor_spec, legacy_output);
}
case DataType::UINT32: {
return create_row_major_owned_buffer(
owned_buffer::get_as<uint32_t>(storage.buffer), tensor_spec, legacy_output);
std::move(owned_buffer::get_as<uint32_t>(storage.buffer)), tensor_spec, legacy_output);
}
case DataType::FLOAT32: {
return create_row_major_owned_buffer(
owned_buffer::get_as<float>(storage.buffer), tensor_spec, legacy_output);
std::move(owned_buffer::get_as<float>(storage.buffer)), tensor_spec, legacy_output);
}
case DataType::BFLOAT16: {
return create_row_major_owned_buffer(
owned_buffer::get_as<::bfloat16>(storage.buffer), tensor_spec, legacy_output);
std::move(owned_buffer::get_as<::bfloat16>(storage.buffer)), tensor_spec, legacy_output);
}
case DataType::BFLOAT8_B:
case DataType::BFLOAT4_B: {
Expand All @@ -483,7 +482,7 @@ std::variant<OwnedBuffer, BorrowedBuffer> get_host_buffer_from_tensor(
: unpack_bfp4_tiles_into_float_vec(
uint32_data, /*row_major_output=*/false, /*is_exp_a=*/false, tile);
auto input_float_buffer = owned_buffer::create<float>(std::move(float_unpacked_data));
return create_row_major_owned_buffer(input_float_buffer, tensor_spec, legacy_output);
return create_row_major_owned_buffer(std::move(input_float_buffer), tensor_spec, legacy_output);
}
default: {
TT_THROW("Unsupported DataType: {}", tt_dtype);
Expand Down
6 changes: 6 additions & 0 deletions ttnn/cpp/ttnn/tensor/layout/tensor_layout.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -260,6 +260,12 @@ Size TensorLayout::get_physical_shard_shape() const {
}
}

// Collapses an N-dimensional shape into a 2D Size: the last dimension becomes
// the width, and the product of all remaining dimensions becomes the height.
// NOTE(review): assumes the last dimension is non-zero (height is derived by
// dividing the total volume by it) — confirm callers never pass a zero-width shape.
Size TensorLayout::compute_logical_2d_shape(const ttnn::SimpleShape& shape) const {
    const size_t last_dim = shape[-1];
    const size_t flattened_rows = shape.volume() / last_dim;
    return Size{flattened_rows, last_dim};
}

Size TensorLayout::compute_physical_shape(const ttnn::SimpleShape& shape) const {
const int rank = static_cast<int>(shape.rank());
const int alignment_rank = static_cast<int>(alignment_.size());
Expand Down
5 changes: 5 additions & 0 deletions ttnn/cpp/ttnn/tensor/layout/tensor_layout.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,11 @@ class TensorLayout {
[[deprecated("Use of LegacyPaddedShape is deprecated. Please use get_physical_size() or get_strides() instead.")]]
ttnn::SimpleShape compute_padded_shape(const ttnn::SimpleShape& shape) const;

// Flattens input shape into height and width
// - Height is accumulated over all dims except last
// - Width is equal to the last dim
Size compute_logical_2d_shape(const ttnn::SimpleShape& shape) const;

// Returns number of elements laid out in physically memory across H:W dimensions
// W is row width aligned to page width and shard width, depends on data type
// H is all dimensions except W multiplied and aligned to tile and shard height
Expand Down
2 changes: 1 addition & 1 deletion ttnn/cpp/ttnn/tensor/tensor.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -716,7 +716,7 @@ std::vector<float> Tensor::to_vector<float>() const {
: unpack_bfp4_tiles_into_float_vec(
packed_data, /*row_major_output=*/false, /*is_exp_a=*/false, tile);

return tensor_impl::decode_tensor_data(unpacked_data, cpu_tensor.tensor_spec());
return tensor_impl::decode_tensor_data(std::move(unpacked_data), cpu_tensor.tensor_spec());
}
default: {
TT_THROW("Cannot convert tensor to vector for data type: {}", cpu_tensor.get_dtype());
Expand Down
Loading

0 comments on commit db1d0cb

Please sign in to comment.