Skip to content

Commit

Permalink
#14245: Fix the L1 memory alignment issue caused by DRAM-L1 memory co…
Browse files Browse the repository at this point in the history
…py. Added the flag in mem_config to indicate memory alignment in L1 (#15415)

### Ticket
Link to Github Issue
#14245

### Problem description
Memory copied from DRAM to L1 is not well aligned to 16B: when the shard
length is 16B aligned but not 32B aligned, there are gaps in L1.

### What's changed
Added a flag in the memory_config data structure to indicate the L1 alignment;
the flag is used to decide the alignment in L1 for the i2s and s2i ops. The
update can also address issues for some ops that modify the tensor
size in L1, such as max pooling and dilation.

### Checklist
- [x] Post commit CI passes
https://github.com/tenstorrent/tt-metal/actions/runs/12110638689
- [x] Blackhole Post commit (if applicable)
https://github.com/tenstorrent/tt-metal/actions/runs/12110643661
- [ ] Model regression CI testing passes (if applicable)
- [ ] Device performance regression CI testing passes (if applicable)
- [ ] New/Existing tests provide coverage for changes

---------

Co-authored-by: wransom-TT <wransom@tenstorrent.com>
  • Loading branch information
llongTT and wransom-TT authored Dec 4, 2024
1 parent d0a192d commit 8a2a1f8
Show file tree
Hide file tree
Showing 16 changed files with 68 additions and 29 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@ std::vector<Tensor> InterleavedToShardedDeviceOperation::create_output_tensors(c
operation::ProgramWithCallbacks InterleavedToShardedDeviceOperation::create_program(const std::vector<Tensor>& input_tensors, std::vector<Tensor> &output_tensors) const {
const auto& input_tensor = input_tensors.at(0);
auto& output_tensor = output_tensors.at(0);
return detail::interleaved_to_sharded_multi_core(input_tensor, output_tensor);
return detail::interleaved_to_sharded_multi_core(input_tensor, output_tensor, this->keep_l1_aligned);
}


Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ namespace ttnn::operations::data_movement {
struct InterleavedToShardedDeviceOperation {
const tt::tt_metal::MemoryConfig output_mem_config;
const tt::tt_metal::DataType output_dtype;
const bool keep_l1_aligned = false;

void validate(const std::vector<Tensor>& input_tensors) const;
std::vector<tt::tt_metal::LegacyShape> compute_output_shapes(const std::vector<Tensor>& input_tensors) const;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ using namespace tt::tt_metal;
namespace ttnn::operations::data_movement::detail {

operation::ProgramWithCallbacks interleaved_to_sharded_multi_core(
const Tensor& input, const Tensor& output, uint32_t num_slices, uint32_t slice_index) {
const Tensor& input, const Tensor& output, bool keep_l1_aligned, uint32_t num_slices, uint32_t slice_index) {
tt::tt_metal::Program program{};

uint32_t num_units, num_units_per_shard, input_unit_size, output_unit_size, num_units_per_shard_width,
Expand Down Expand Up @@ -71,7 +71,13 @@ operation::ProgramWithCallbacks interleaved_to_sharded_multi_core(
// TODO: Use a different variable name. Units refers to pages, but this is being used as size
num_units_per_shard_width_last =
input_unit_size - (tt::round_up(num_units_per_row, input_unit_size) - num_units_per_row);
padded_offset_bytes = align(input_unit_size, input.buffer()->alignment());
//Adjust accordingly to l1 alignment, do it for all archs
if(keep_l1_aligned){
padded_offset_bytes = align(input_unit_size, hal.get_alignment(HalMemType::L1));
}
else {
padded_offset_bytes = align(input_unit_size, input.buffer()->alignment());
}
}


Expand All @@ -95,7 +101,7 @@ operation::ProgramWithCallbacks interleaved_to_sharded_multi_core(
.set_globally_allocated_address(*output.buffer());
auto cb_output = tt::tt_metal::CreateCircularBuffer(program, all_cores, output_cb_out_config);
uint32_t dram_alignment = hal.get_alignment(HalMemType::DRAM);
if (src_is_dram && input_unit_size % dram_alignment != 0 or is_blackhole) {
if (src_is_dram && input_unit_size % dram_alignment != 0 or is_blackhole or keep_l1_aligned) {
uint32_t scratch_cb_page_size;
//scratchpad going to be used to align DRAM (64B) to L1 (16B)
if (is_blackhole) {
Expand Down Expand Up @@ -246,7 +252,8 @@ operation::ProgramWithCallbacks interleaved_to_sharded_multi_core(
uint32_t dram_alignment = hal.get_alignment(HalMemType::DRAM);
uint32_t l1_alignment = hal.get_alignment(HalMemType::L1);
bool aligned = (src_is_dram ? curr_idx_w % dram_alignment == 0 : true);
aligned = aligned and !(is_blackhole);
//for blackhole and keep_l1_aligned cases, always enforce unaligned kernel call
aligned = aligned and !(is_blackhole) and !(keep_l1_aligned);
uint32_t aligned_width_offset, aligned_shard_width, aligned_offset;
if (!aligned) {
//TODO: is this right, leaving non BH case the same for now, should investigate
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,6 @@
namespace ttnn::operations::data_movement::detail {


operation::ProgramWithCallbacks interleaved_to_sharded_multi_core(const Tensor &a, const Tensor &output, uint32_t num_slices = 1, uint32_t slice_index = 0);
operation::ProgramWithCallbacks interleaved_to_sharded_multi_core(const Tensor &a, const Tensor &output, bool keep_l1_aligned = false, uint32_t num_slices = 1, uint32_t slice_index = 0);

}
Original file line number Diff line number Diff line change
Expand Up @@ -16,11 +16,13 @@ ttnn::Tensor InterleavedToShardedOperation::invoke(
uint8_t queue_id,
const ttnn::Tensor& input_tensor,
const MemoryConfig& sharded_memory_config,
const std::optional<DataType>& data_type_arg) {
const std::optional<DataType>& data_type_arg,
const std::optional<bool>& keep_l1_aligned) {
return operation::run(
InterleavedToShardedDeviceOperation{
.output_mem_config = sharded_memory_config,
.output_dtype = data_type_arg.value_or(input_tensor.get_dtype())},
.output_dtype = data_type_arg.value_or(input_tensor.get_dtype()),
.keep_l1_aligned = keep_l1_aligned.value_or(false)},
{input_tensor})
.at(0);
}
Expand All @@ -32,7 +34,8 @@ ttnn::Tensor InterleavedToShardedOperation::invoke(
const std::array<uint32_t, 2> shard_shape,
const TensorMemoryLayout shard_scheme,
const ShardOrientation shard_orientation,
const std::optional<DataType>& data_type_arg) {
const std::optional<DataType>& data_type_arg,
const std::optional<bool>& keep_l1_aligned) {
bool row_wise = shard_orientation == ShardOrientation::ROW_MAJOR;
CoreCoord grid_size;
CoreRangeSet grid_set;
Expand Down Expand Up @@ -69,7 +72,8 @@ ttnn::Tensor InterleavedToShardedOperation::invoke(
return operation::run(
InterleavedToShardedDeviceOperation{
.output_mem_config = sharded_mem_config,
.output_dtype = data_type_arg.value_or(input_tensor.get_dtype())},
.output_dtype = data_type_arg.value_or(input_tensor.get_dtype()),
.keep_l1_aligned = keep_l1_aligned.value_or(false)},
{input_tensor})
.at(0);
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -15,15 +15,17 @@ struct InterleavedToShardedOperation {
uint8_t queue_id,
const ttnn::Tensor& input_tensor,
const MemoryConfig& sharded_memory_config,
const std::optional<DataType>& data_type_arg);
const std::optional<DataType>& data_type_arg,
const std::optional<bool>& keep_l1_aligned = std::nullopt);
static ttnn::Tensor invoke(
uint8_t queue_id,
const ttnn::Tensor& input_tensor,
const std::variant<CoreCoord, CoreRangeSet>& grid,
const std::array<uint32_t, 2> shard_shape,
const TensorMemoryLayout shard_scheme,
const ShardOrientation shard_orientation,
const std::optional<DataType>& data_type_arg);
const std::optional<DataType>& data_type_arg,
const std::optional<bool>& keep_l1_aligned = std::nullopt);
};

} // namespace operations::data_movement
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -29,8 +29,17 @@ void bind_interleaved_to_sharded(
tt::tt_metal::TensorMemoryLayout shard_scheme,
tt::tt_metal::ShardOrientation shard_orientation,
const std::optional<ttnn::DataType>& output_dtype,
uint8_t queue_id) -> ttnn::Tensor {
return self(queue_id, input_tensor, grid, shard_shape, shard_scheme, shard_orientation, output_dtype);
uint8_t queue_id,
const std::optional<bool>& keep_l1_aligned) -> ttnn::Tensor {
return self(
queue_id,
input_tensor,
grid,
shard_shape,
shard_scheme,
shard_orientation,
output_dtype,
keep_l1_aligned);
},
py::arg("input_tensor").noconvert(),
py::arg("grid"),
Expand All @@ -40,21 +49,24 @@ void bind_interleaved_to_sharded(
py::arg("output_dtype") = std::nullopt,
py::kw_only(),
py::arg("queue_id") = 0,
py::arg("keep_l1_aligned") = false,

},
ttnn::pybind_overload_t{
[](const data_movement_sharded_operation_t& self,
const ttnn::Tensor& input_tensor,
const MemoryConfig& sharded_memory_config,
const std::optional<ttnn::DataType>& output_dtype,
uint8_t queue_id) -> ttnn::Tensor {
return self(queue_id, input_tensor, sharded_memory_config, output_dtype);
uint8_t queue_id,
const std::optional<bool>& keep_l1_aligned) -> ttnn::Tensor {
return self(queue_id, input_tensor, sharded_memory_config, output_dtype, keep_l1_aligned);
},
py::arg("input_tensor").noconvert(),
py::arg("sharded_memory_config"),
py::arg("output_dtype") = std::nullopt,
py::kw_only(),
py::arg("queue_id") = 0,
py::arg("keep_l1_aligned") = false,

});
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,7 @@ operation::ProgramWithCallbacks ShardedToInterleavedDeviceOperation::create_prog
const std::vector<Tensor>& input_tensors, std::vector<Tensor>& output_tensors) const {
const auto& input_tensor = input_tensors.at(0);
auto& output_tensor = output_tensors.at(0);
return detail::sharded_to_interleaved_multi_core(input_tensor, output_tensor);
return detail::sharded_to_interleaved_multi_core(input_tensor, output_tensor, this->is_l1_aligned);
}

} // namespace ttnn::operations::data_movement
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ namespace ttnn::operations::data_movement {
struct ShardedToInterleavedDeviceOperation {
const tt::tt_metal::MemoryConfig output_mem_config;
const tt::tt_metal::DataType output_dtype;
const bool is_l1_aligned = false;

void validate(const std::vector<Tensor>& input_tensors) const;
std::vector<tt::tt_metal::LegacyShape> compute_output_shapes(const std::vector<Tensor>& input_tensors) const;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ using namespace tt::tt_metal;
namespace ttnn::operations::data_movement::detail {

operation::ProgramWithCallbacks sharded_to_interleaved_multi_core(
const Tensor& input, const Tensor& output, uint32_t num_slices, uint32_t slice_index) {
const Tensor& input, const Tensor& output, bool is_l1_aligned, uint32_t num_slices, uint32_t slice_index) {
tt_metal::Program program{};

uint32_t num_units, num_units_per_shard, input_unit_size, output_unit_size, num_units_per_shard_width,
Expand Down Expand Up @@ -235,8 +235,8 @@ operation::ProgramWithCallbacks sharded_to_interleaved_multi_core(
uint32_t dram_alignment = hal.get_alignment(HalMemType::DRAM);
uint32_t l1_alignment = hal.get_alignment(HalMemType::L1);
uint32_t padded_shard_width = align(output_unit_size, dst_buffer->alignment());
if(is_blackhole) {
if(!dst_is_dram)
if(is_blackhole or is_l1_aligned) {
if(!dst_is_dram or is_l1_aligned)
padded_shard_width = align(output_unit_size, l1_alignment);
}
tt_metal::SetRuntimeArgs(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,9 @@
namespace ttnn::operations::data_movement::detail {

operation::ProgramWithCallbacks sharded_to_interleaved_multi_core(
const Tensor& a, const Tensor& output, uint32_t num_slices = 1, uint32_t slice_index = 0);

const Tensor& a,
const Tensor& output,
bool is_l1_aligned = false,
uint32_t num_slices = 1,
uint32_t slice_index = 0);
}
Original file line number Diff line number Diff line change
Expand Up @@ -15,14 +15,17 @@ ttnn::Tensor ShardedToInterleavedOperation::invoke(
uint8_t queue_id,
const ttnn::Tensor& input_tensor,
const MemoryConfig& memory_config,
const std::optional<DataType>& output_dtype) {
const std::optional<DataType>& output_dtype,
const std::optional<bool>& is_l1_aligned) {
std::vector<Tensor> output_tensors = {Tensor(operation::get_workers_for_op_output({input_tensor}))};

auto shard_spec = input_tensor.shard_spec().value();
TT_FATAL(input_tensor.shard_spec().has_value(), "Error");
return operation::run(
ShardedToInterleavedDeviceOperation{
.output_mem_config = memory_config, .output_dtype = output_dtype.value_or(input_tensor.get_dtype())},
.output_mem_config = memory_config,
.output_dtype = output_dtype.value_or(input_tensor.get_dtype()),
.is_l1_aligned = is_l1_aligned.value_or(false)},
{input_tensor})
.at(0);
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,8 @@ struct ShardedToInterleavedOperation {
uint8_t queue_id,
const ttnn::Tensor& input_tensor,
const MemoryConfig& memory_config,
const std::optional<DataType>& output_dtype);
const std::optional<DataType>& output_dtype,
const std::optional<bool>& is_l1_aligned = std::nullopt);
};

} // namespace operations::data_movement
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -27,18 +27,21 @@ void bind_sharded_to_interleaved(
const ttnn::Tensor& input_tensor,
const std::optional<MemoryConfig>& memory_config,
const std::optional<DataType>& output_dtype,
uint8_t queue_id) -> ttnn::Tensor {
uint8_t queue_id,
const std::optional<bool>& is_l1_aligned) -> ttnn::Tensor {
return self(
queue_id,
input_tensor,
memory_config.value_or(operation::DEFAULT_OUTPUT_MEMORY_CONFIG),
output_dtype);
output_dtype,
is_l1_aligned);
},
py::arg("input_tensor").noconvert(),
py::arg("memory_config") = std::nullopt,
py::arg("output_dtype") = std::nullopt,
py::kw_only(),
py::arg("queue_id") = 0,
py::arg("is_l1_aligned") = false,
});
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -74,7 +74,8 @@ operation::ProgramWithCallbacks InterleavedToShardedPartialDeviceOperation::crea
const auto& input_tensor = input_tensors.at(0);
auto& output_tensor = output_tensors.at(0);
// Will move with sharded ops
return detail::interleaved_to_sharded_multi_core(input_tensor, output_tensor, this->num_slices, this->slice_index);
return detail::interleaved_to_sharded_multi_core(
input_tensor, output_tensor, false, this->num_slices, this->slice_index);
}

} // namespace ttnn::operations::data_movement
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,8 @@ operation::ProgramWithCallbacks ShardedToInterleavedPartialDeviceOperation::crea
const auto& input_tensor = input_tensors.at(0);
auto& output_tensor = input_tensors[1];
// Will move with sharded ops
return detail::sharded_to_interleaved_multi_core(input_tensor, output_tensor, this->num_slices, this->slice_index);
return detail::sharded_to_interleaved_multi_core(
input_tensor, output_tensor, false, this->num_slices, this->slice_index);
}

} // namespace ttnn::operations::data_movement

0 comments on commit 8a2a1f8

Please sign in to comment.