Skip to content

Commit

Permalink
Add support for block sharding.
Browse files Browse the repository at this point in the history
	ToDo: commonize code.

Signed-off-by: Nilaykumar Patel <nkpatel@tenstorrent.com>
  • Loading branch information
nkpatel-tt committed Dec 10, 2024
1 parent 8890e70 commit e39f225
Show file tree
Hide file tree
Showing 2 changed files with 75 additions and 50 deletions.
6 changes: 3 additions & 3 deletions tests/ttnn/unit_tests/operations/test_upsample.py
Original file line number Diff line number Diff line change
Expand Up @@ -115,8 +115,8 @@ def test_upsample_single_core(device, input_shapes, scale_h, scale_w):
[5, 64, 5, 5],
[1, 128, 5, 8],
[1, 32, 5, 4],
[7, 64, 128, 17],
[3, 64, 132, 19],
[1, 64, 128, 17],
[1, 64, 132, 19],
],
)
@pytest.mark.parametrize("device_params", [{"l1_small_size": 24576}], indirect=True)
Expand All @@ -132,7 +132,7 @@ def test_upsample_multi_core(device, input_shape, scale_h, scale_w, shard_strate
# for j in range(input_shape[1]):
# for k in range(input_shape[2]):
# for l in range(input_shape[3]):
# input[i, j, k, l] = k * width + l + 1
# input[i, j, k, l] = (k * width + l + 1)

## golden reference using torch
scale_factor = (scale_h, scale_w)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
#include <vector>

#include "buffers/buffer_constants.hpp"
#include "common/assert.hpp"
#include "common/core_coord.hpp"
#include "ttnn/tensor/host_buffer/functions.hpp"

Expand All @@ -20,37 +21,28 @@ using namespace tt::tt_metal;
namespace ttnn::operations::upsample {
using namespace tt;

Tensor create_config_tensor(
Tensor create_config_tensor_height_sharded(
Device *device,
ShardSpec &input_shard_spec,
uint32_t input_nsticks_per_core,
const uint32_t batch_size,
const uint32_t in_h,
const uint32_t in_w,
const uint32_t scale_factor_h,
const uint32_t scale_factor_w,
TensorMemoryLayout shard_scheme,
uint32_t ncores_nhw,
uint32_t ncores_x) {
const uint32_t scale_factor_w) {
std::vector<uint16_t> config_vector;
uint32_t input_nsticks_per_core = input_shard_spec.shape[0];
uint32_t ncores_x = device->compute_with_storage_grid_size().x;
uint32_t in_core = 0;
uint32_t w = 0;
uint32_t curr_stick = 0;
if(shard_scheme == TensorMemoryLayout::HEIGHT_SHARDED) {
ncores_x = 1;
ncores_nhw = 1;
}
uint32_t physical_core_x = device->compute_with_storage_grid_size().x;

auto core_coords = device->worker_core_from_logical_core(CoreCoord(in_core % physical_core_x, in_core / physical_core_x));
auto core_coords = device->worker_core_from_logical_core(CoreCoord(in_core % ncores_x, in_core / ncores_x));
for (uint32_t b = 0; b < batch_size; b++) {
for (uint32_t h = 0; h < in_h; h++) {
for (uint32_t w = 0; w < in_w; w++) {
if (curr_stick == input_nsticks_per_core) {
curr_stick = 0;
in_core++;
core_coords =
device->worker_core_from_logical_core(CoreCoord(0, in_core));
device->worker_core_from_logical_core(CoreCoord(in_core % ncores_x, in_core / ncores_x));
}
config_vector.insert(config_vector.end(), {core_coords.x, core_coords.y, curr_stick, 0});
curr_stick++;
Expand All @@ -59,27 +51,50 @@ Tensor create_config_tensor(
config_vector.insert(config_vector.end(), config_vector.end() - (4 * in_w), config_vector.end());
}
}
// Copy for y direction
std::vector<uint16_t> temp_config_vector;
/*auto prev_idx = 0;*/
/*auto idx = 0;*/
/*for(uint32_t i = 0; i < ncores_nhw; i++) {*/
/* idx = 4 * (i+1) * input_nsticks_per_core * scale_factor_h;*/
/* for(uint32_t j = 0; j < ncores_x; j++) {*/
/* temp_config_vector.insert(temp_config_vector.end(), config_vector.begin() + prev_idx, config_vector.begin() + idx);*/
/* }*/
/* prev_idx = idx;*/
/*}*/
for(uint32_t i = 0; i < ncores_x; i++) {
/*TODO: Change take core x into considereation.*/
temp_config_vector.insert(temp_config_vector.end(), config_vector.begin(), config_vector.end());
}

uint32_t elems_per_core = 4 * scale_factor_h * input_nsticks_per_core;
Shape config_shape = Shape({config_vector.size() / elems_per_core, elems_per_core});
auto config_buffer = owned_buffer::create<uint16_t>(std::move(config_vector));
Tensor config_tensor = Tensor(OwnedStorage{config_buffer}, config_shape, DataType::UINT16, Layout::ROW_MAJOR);
return config_tensor;
}

Tensor create_config_tensor_block_sharded(
Device *device,
uint32_t input_nsticks_per_core,
const uint32_t batch_size,
const uint32_t in_h,
const uint32_t in_w,
const uint32_t scale_factor_h,
const uint32_t scale_factor_w,
uint32_t ncores_x) {
std::vector<uint16_t> config_vector;
uint32_t in_core = 0;
uint32_t w = 0;
uint32_t curr_stick = 0;

CoreCoord core_coords;
for (uint32_t b = 0; b < batch_size; b++) {
for (uint32_t h = 0; h < in_h; h++) {
for (uint32_t w = 0; w < in_w; w++) {
if (curr_stick == input_nsticks_per_core) {
curr_stick = 0;
in_core++;
}
config_vector.insert(config_vector.end(), {in_core, curr_stick});
curr_stick++;
}
for (uint32_t j = 0; j < scale_factor_h - 1; j++)
config_vector.insert(config_vector.end(), config_vector.end() - (2 * in_w), config_vector.end());
}
}
std::vector<uint16_t> temp_config_vector;

using namespace std;
uint32_t core = 0;
for(auto i = 0; i < temp_config_vector.size(); i+=4) {
cout << temp_config_vector[i] << " " << temp_config_vector[i+1] << " " << temp_config_vector[i+2] << " " << temp_config_vector[i+3] << endl;
for(uint32_t i = 0; i < ncores_x; i++) {
for(uint32_t j = 0; j < config_vector.size(); j+=2) {
core_coords = device->worker_core_from_logical_core(CoreCoord(i, config_vector[j]));
temp_config_vector.insert(temp_config_vector.end(), {core_coords.x, core_coords.y, config_vector[j+1], 0});
}
}
uint32_t elems_per_core = 4 * scale_factor_h * input_nsticks_per_core;
Shape config_shape = Shape({temp_config_vector.size() / elems_per_core, elems_per_core});
Expand Down Expand Up @@ -172,24 +187,34 @@ operation::ProgramWithCallbacks upsample_multi_core(const Tensor &input, Tensor&
log_debug(LogOp, "input_nsticks_per_core: {}, output_nsticks_per_core: {}", input_nsticks_per_core, output_nsticks_per_core);

// create config tensor
Tensor config_tensor = create_config_tensor(
device,
shard_spec,
input.legacy_shape()[0],
input.legacy_shape()[1],
in_w,
scale_factor_h,
scale_factor_w,
input.memory_config().memory_layout,
ncores_nhw,
ncores_x);
config_tensor.print();
Tensor config_tensor;
if(input.memory_config().memory_layout == TensorMemoryLayout::BLOCK_SHARDED) {
config_tensor = create_config_tensor_block_sharded(
device,
shard_spec.shape[0],
input.legacy_shape()[0],
input.legacy_shape()[1],
in_w,
scale_factor_h,
scale_factor_w,
ncores_x);
} else if (input.memory_config().memory_layout == TensorMemoryLayout::HEIGHT_SHARDED) {
config_tensor = create_config_tensor_height_sharded(
device,
shard_spec.shape[0],
input.legacy_shape()[0],
input.legacy_shape()[1],
in_w,
scale_factor_h,
scale_factor_w);
} else {
TT_THROW("Unsupported sharding layout");
}
auto shard_shape = std::array<uint32_t, 2>({1, (uint32_t)config_tensor.get_shape()[-1]});
auto config_tensor_shard_orientation = input.memory_config().memory_layout == TensorMemoryLayout::BLOCK_SHARDED ? (shard_spec.orientation == ShardOrientation::COL_MAJOR ? ShardOrientation::ROW_MAJOR : ShardOrientation::COL_MAJOR) : ShardOrientation::ROW_MAJOR;
ShardSpec config_shard_spec(input.shard_spec().value().grid, shard_shape, config_tensor_shard_orientation, false);
MemoryConfig memory_config{input.memory_config().memory_layout, BufferType::L1_SMALL, config_shard_spec};
auto config_tensor_device = config_tensor.to(device, memory_config);
config_tensor_device.print();
tt::tt_metal::detail::AddConfigBuffer(program, config_tensor_device.device_buffer());

tt::DataFormat config_df = tt::DataFormat::RawUInt16;
Expand Down

0 comments on commit e39f225

Please sign in to comment.