Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

#14366: Allow creating pre-allocated buffer with an address. Minor perf improvements #14394

Merged
merged 6 commits into from
Oct 29, 2024
Merged
Show file tree
Hide file tree
Changes from 5 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
28 changes: 26 additions & 2 deletions tt_metal/host_api.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -255,21 +255,45 @@ uint32_t CreateSemaphore(
*
* | Argument | Description | Type | Valid Range | Required |
* |-----------------|---------------------------------------- |--------------------------|-------------|----------|
* | config | config for buffer | InterleavedBufferConfig | | Yes |
* | config | Config for the buffer | InterleavedBufferConfig | | Yes |
*/
std::shared_ptr<Buffer> CreateBuffer(const InterleavedBufferConfig &config);

/**
* Creates a pre-allocated interleaved DRAM or L1 buffer on device
*
* Return value: std::shared_ptr<Buffer>
*
* | Argument | Description | Type | Valid Range | Required |
* |-----------------|---------------------------------------- |--------------------------|-------------|----------|
* | config | Config for the buffer | InterleavedBufferConfig | | Yes |
* | address | Device address of the buffer | DeviceAddr | | Yes |
*/
std::shared_ptr<Buffer> CreateBuffer(const InterleavedBufferConfig &config, DeviceAddr address);

/**
* Allocates a sharded DRAM or L1 buffer on device
*
* Return value: std::shared_ptr<Buffer>
*
* | Argument | Description | Type | Valid Range | Required |
* |-----------------|---------------------------------------- |--------------------------|-------------|----------|
* | config | config for buffer | ShardedBufferConfig | | Yes |
* | config | Config for the buffer | ShardedBufferConfig | | Yes |
*/
std::shared_ptr<Buffer> CreateBuffer(const ShardedBufferConfig &config);

/**
* Creates a pre-allocated sharded DRAM or L1 buffer on device
*
* Return value: std::shared_ptr<Buffer>
*
* | Argument | Description | Type | Valid Range | Required |
* |-----------------|---------------------------------------- |--------------------------|-------------|----------|
* | config | Config for the buffer | ShardedBufferConfig | | Yes |
* | address | Device address of the buffer | DeviceAddr | | Yes |
*/
std::shared_ptr<Buffer> CreateBuffer(const ShardedBufferConfig &config, DeviceAddr address);

/**
* Deallocates buffer from device by marking its memory as free.
*
Expand Down
49 changes: 44 additions & 5 deletions tt_metal/impl/buffers/buffer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -203,14 +203,17 @@ Buffer::Buffer(
const BufferType buffer_type,
const TensorMemoryLayout buffer_layout,
const std::optional<ShardSpecBuffer>& shard_parameters,
const std::optional<bool> bottom_up) :
const std::optional<bool> bottom_up,
const bool owns_data,
Private) :
device_(device),
size_(size),
page_size_(page_size),
buffer_type_(buffer_type),
buffer_layout_(buffer_layout),
shard_parameters_(shard_parameters),
bottom_up_(bottom_up.value_or(this->is_dram())),
owns_data_(owns_data),
buffer_page_mapping_(nullptr) {
TT_FATAL(this->device_ != nullptr && this->device_->allocator_ != nullptr, "Device and allocator need to not be null.");

Expand All @@ -227,7 +230,8 @@ std::shared_ptr<Buffer> Buffer::create(
const TensorMemoryLayout buffer_layout,
const std::optional<ShardSpecBuffer>& shard_parameters,
const std::optional<bool> bottom_up) {
auto* bufferPtr = new Buffer(device, size, page_size, buffer_type, buffer_layout, shard_parameters, bottom_up);
auto* bufferPtr = new Buffer(device, size, page_size, buffer_type, buffer_layout, shard_parameters, bottom_up, true /* owns data */, Private());
// Using a custom deleter to properly clean up the owned data
auto buffer = std::shared_ptr<Buffer>(bufferPtr, deleter);
buffer->weak_self = buffer;

Expand All @@ -237,19 +241,50 @@ std::shared_ptr<Buffer> Buffer::create(
}

buffer->device_->push_work([buffer] {
buffer->address_ = detail::AllocateBuffer(buffer.get());
try {
buffer->address_ = detail::AllocateBuffer(buffer.get());
} catch(...) {
std::unique_lock lock(buffer->allocation_mutex_);
buffer->allocation_status_.store(AllocationStatus::ALLOCATION_FAILED, std::memory_order::relaxed);
lock.unlock();
buffer->allocation_cv_.notify_all();

throw;
}

std::unique_lock lock(buffer->allocation_mutex_);
buffer->allocation_status_.store(AllocationStatus::ALLOCATED, std::memory_order::relaxed);
buffer->allocation_status_.store(AllocationStatus::ALLOCATED, std::memory_order::release);
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Does this really need to be made release? It seems like we're already protected by a lock.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes. In Buffer::address() the first check is allocation_status_.load(std::memory_order::acquire) != AllocationStatus::ALLOCATION_REQUESTED performed without a lock. This is done this way for performance optimization reasons. The only two places that capture a lock are Buffer::create when allocating and Buffer::address when waiting on an allocation using a condition variable.

lock.unlock();
buffer->allocation_cv_.notify_all();
});

return buffer;
}

std::shared_ptr<Buffer> Buffer::create(
    Device *device,
    DeviceAddr address,
    DeviceAddr size,
    DeviceAddr page_size,
    const BufferType buffer_type,
    const TensorMemoryLayout buffer_layout,
    const std::optional<ShardSpecBuffer>& shard_parameters,
    const std::optional<bool> bottom_up) {
    // This overload wraps a pre-allocated device region: the Buffer does not
    // own the underlying data, so the plain shared_ptr deleter is sufficient
    // (no custom deleter needed to release device memory).
    auto prealloc = std::make_shared<Buffer>(
        device, size, page_size, buffer_type, buffer_layout, shard_parameters, bottom_up, false /* owns data */, Private());
    prealloc->weak_self = prealloc;

    // The caller supplies the device address, so the buffer is usable
    // immediately — mark it ALLOCATED without going through the device
    // work queue. Relaxed ordering is sufficient here: the buffer has not
    // been shared with other threads yet.
    prealloc->address_ = address;
    prealloc->allocation_status_.store(AllocationStatus::ALLOCATED, std::memory_order::relaxed);

    return prealloc;
}

void Buffer::deallocate() {
deallocation_requested_.store(true, std::memory_order::relaxed);
if (!owns_data_) {
return;
}
device_->push_work([self = weak_self.lock()] {
self->deallocate_impl();
});
Expand All @@ -263,7 +298,7 @@ void Buffer::deleter(Buffer* buffer) {
}

void Buffer::deallocate_impl() {
if (allocation_status_.load(std::memory_order::relaxed) == AllocationStatus::DEALLOCATED) {
if (allocation_status_.load(std::memory_order::relaxed) != AllocationStatus::ALLOCATED) {
return;
}

Expand All @@ -289,6 +324,10 @@ bool Buffer::is_allocated() const {
}

uint32_t Buffer::address() const {
if (allocation_status_.load(std::memory_order::acquire) != AllocationStatus::ALLOCATION_REQUESTED) {
return address_;
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Optimization for the happy case, no waiting or other checks if buffer is already allocated.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Makes sense :)

}

if (device_->can_use_passthrough_scheduling()) {
return address_;
}
Expand Down
20 changes: 18 additions & 2 deletions tt_metal/impl/buffers/buffer.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -144,6 +144,8 @@ struct BufferPageMapping {
inline namespace v0 {

class Buffer final {
struct Private { explicit Private() = default; };
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The Private structure is used to limit access to the Buffer constructor, which can't be truly private, because of std::make_shared. This is the way recommended by CppReference documentation: https://en.cppreference.com/w/cpp/memory/enable_shared_from_this (see Best example)


public:
static std::shared_ptr<Buffer> create(
Device *device,
Expand All @@ -153,6 +155,15 @@ class Buffer final {
TensorMemoryLayout buffer_layout = TensorMemoryLayout::INTERLEAVED,
const std::optional<ShardSpecBuffer>& shard_parameter = std::nullopt,
std::optional<bool> bottom_up = std::nullopt);
static std::shared_ptr<Buffer> create(
Device *device,
DeviceAddr address,
DeviceAddr size,
DeviceAddr page_size,
BufferType buffer_type,
TensorMemoryLayout buffer_layout = TensorMemoryLayout::INTERLEAVED,
const std::optional<ShardSpecBuffer>& shard_parameter = std::nullopt,
std::optional<bool> bottom_up = std::nullopt);

Buffer(const Buffer &other) = delete;
Buffer &operator=(const Buffer &other) = delete;
Expand Down Expand Up @@ -210,18 +221,22 @@ class Buffer final {

const std::shared_ptr<const BufferPageMapping>& get_buffer_page_mapping();

private:

Buffer(
Device *device,
DeviceAddr size,
DeviceAddr page_size,
BufferType buffer_type,
TensorMemoryLayout buffer_layout,
const std::optional<ShardSpecBuffer>& shard_parameter,
std::optional<bool> bottom_up);
std::optional<bool> bottom_up,
bool owns_data,
Private);

private:
enum class AllocationStatus : uint8_t {
ALLOCATION_REQUESTED,
ALLOCATION_FAILED,
ALLOCATED,
DEALLOCATED,
};
Expand All @@ -239,6 +254,7 @@ class Buffer final {
const BufferType buffer_type_;
const TensorMemoryLayout buffer_layout_;
const bool bottom_up_;
const bool owns_data_;

std::atomic<AllocationStatus> allocation_status_ = AllocationStatus::ALLOCATION_REQUESTED;
DeviceAddr address_ = 0;
Expand Down
17 changes: 17 additions & 0 deletions tt_metal/tt_metal.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1090,6 +1090,11 @@ std::shared_ptr<Buffer> CreateBuffer(const InterleavedBufferConfig &config) {
config.device, config.size, config.page_size, config.buffer_type, config.buffer_layout, std::nullopt, std::nullopt);
}

// Creates a Buffer wrapping a pre-allocated interleaved region at `address`,
// described by `config`. Interleaved buffers carry no shard parameters.
std::shared_ptr<Buffer> CreateBuffer(const InterleavedBufferConfig &config, DeviceAddr address) {
    return Buffer::create(
        config.device,
        address,
        config.size,
        config.page_size,
        config.buffer_type,
        config.buffer_layout,
        std::nullopt,
        std::nullopt);
}

std::shared_ptr<Buffer> CreateBuffer(const ShardedBufferConfig &config) {
return Buffer::create(
config.device,
Expand All @@ -1101,6 +1106,18 @@ std::shared_ptr<Buffer> CreateBuffer(const ShardedBufferConfig &config) {
std::nullopt);
}

// Creates a Buffer wrapping a pre-allocated sharded region at `address`,
// forwarding the shard layout from the config.
std::shared_ptr<Buffer> CreateBuffer(const ShardedBufferConfig &config, DeviceAddr address) {
    return Buffer::create(
        config.device, address, config.size, config.page_size, config.buffer_type,
        config.buffer_layout, config.shard_parameters, std::nullopt);
}

// Marks the buffer's device memory as free by delegating to Buffer::deallocate.
void DeallocateBuffer(Buffer &buffer) {
    buffer.deallocate();
}

void AssignGlobalBufferToProgram(
Expand Down
Loading