#14366: Allow creating pre-allocated buffer with an address. Minor perf improvements #14394
Changes from 5 commits: 8cc5fa5, 941ed4c, ba1e84b, b85798a, ed11135, eb9caa7
First file in the diff (the `Buffer` implementation):
```diff
@@ -203,14 +203,17 @@ Buffer::Buffer(
     const BufferType buffer_type,
     const TensorMemoryLayout buffer_layout,
     const std::optional<ShardSpecBuffer>& shard_parameters,
-    const std::optional<bool> bottom_up) :
+    const std::optional<bool> bottom_up,
+    const bool owns_data,
+    Private) :
     device_(device),
     size_(size),
     page_size_(page_size),
     buffer_type_(buffer_type),
     buffer_layout_(buffer_layout),
     shard_parameters_(shard_parameters),
     bottom_up_(bottom_up.value_or(this->is_dram())),
+    owns_data_(owns_data),
     buffer_page_mapping_(nullptr) {
     TT_FATAL(this->device_ != nullptr && this->device_->allocator_ != nullptr, "Device and allocator need to not be null.");
```
```diff
@@ -227,7 +230,8 @@ std::shared_ptr<Buffer> Buffer::create(
     const TensorMemoryLayout buffer_layout,
     const std::optional<ShardSpecBuffer>& shard_parameters,
     const std::optional<bool> bottom_up) {
-    auto* bufferPtr = new Buffer(device, size, page_size, buffer_type, buffer_layout, shard_parameters, bottom_up);
+    auto* bufferPtr = new Buffer(device, size, page_size, buffer_type, buffer_layout, shard_parameters, bottom_up, true /* owns data */, Private());
+    // Using a custom deleter to properly clean up the owned data
     auto buffer = std::shared_ptr<Buffer>(bufferPtr, deleter);
     buffer->weak_self = buffer;
```
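For context, the owning `create` path pairs a raw `new` with a custom `shared_ptr` deleter so destruction can trigger device-side cleanup. Below is a minimal sketch of that general pattern; `Resource`, `release_resource`, and the cleanup body are illustrative stand-ins, not the actual tt-metal code.

```cpp
#include <memory>

// Illustrative stand-in for a device-backed object; not the tt-metal Buffer.
struct Resource {
    bool owns_data = true;
};

// The custom deleter runs when the last shared_ptr reference drops,
// so cleanup of owned data can happen before the object itself is freed.
void release_resource(Resource* r) {
    if (r->owns_data) {
        // e.g. push a deallocation task to a device worker queue here
    }
    delete r;
}

std::shared_ptr<Resource> make_owning_resource() {
    // Raw new plus custom deleter, mirroring the shape of the owning create().
    return std::shared_ptr<Resource>(new Resource{}, release_resource);
}
```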
```diff
@@ -237,19 +241,50 @@ std::shared_ptr<Buffer> Buffer::create(
     }
 
     buffer->device_->push_work([buffer] {
-        buffer->address_ = detail::AllocateBuffer(buffer.get());
+        try {
+            buffer->address_ = detail::AllocateBuffer(buffer.get());
+        } catch(...) {
+            std::unique_lock lock(buffer->allocation_mutex_);
+            buffer->allocation_status_.store(AllocationStatus::ALLOCATION_FAILED, std::memory_order::relaxed);
+            lock.unlock();
+            buffer->allocation_cv_.notify_all();
+
+            throw;
+        }
 
         std::unique_lock lock(buffer->allocation_mutex_);
-        buffer->allocation_status_.store(AllocationStatus::ALLOCATED, std::memory_order::relaxed);
+        buffer->allocation_status_.store(AllocationStatus::ALLOCATED, std::memory_order::release);
         lock.unlock();
         buffer->allocation_cv_.notify_all();
     });
 
     return buffer;
 }
 
+std::shared_ptr<Buffer> Buffer::create(
+    Device *device,
+    DeviceAddr address,
+    DeviceAddr size,
+    DeviceAddr page_size,
+    const BufferType buffer_type,
+    const TensorMemoryLayout buffer_layout,
+    const std::optional<ShardSpecBuffer>& shard_parameters,
+    const std::optional<bool> bottom_up) {
+    // Not using a custom deleter, because it doesn't own any data to clean up
+    auto buffer = std::make_shared<Buffer>(device, size, page_size, buffer_type, buffer_layout, shard_parameters, bottom_up, false /* owns data */, Private());
+    buffer->weak_self = buffer;
+
+    buffer->address_ = address;
+    buffer->allocation_status_.store(AllocationStatus::ALLOCATED, std::memory_order::relaxed);
+
+    return buffer;
+}
+
 void Buffer::deallocate() {
     deallocation_requested_.store(true, std::memory_order::relaxed);
+    if (!owns_data_) {
+        return;
+    }
     device_->push_work([self = weak_self.lock()] {
         self->deallocate_impl();
     });
```
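Taken together, the two overloads give callers an owning and a non-owning path. A hedged usage sketch, where `device`, `existing_address`, the sizes, and the `BufferType::DRAM` enumerator are assumptions standing in for real values from the surrounding codebase:

```cpp
// Sketch only: `device` and `existing_address` are assumed to come from elsewhere.
void example(Device* device, DeviceAddr existing_address) {
    DeviceAddr size = 4096;
    DeviceAddr page_size = 1024;

    // Owning buffer: allocation is queued on the device worker thread, and
    // address() becomes valid once AllocationStatus::ALLOCATED is published.
    auto owning = Buffer::create(device, size, page_size, BufferType::DRAM);

    // Non-owning buffer over pre-allocated memory: no allocation is performed,
    // the status is ALLOCATED immediately, and deallocate() returns early
    // because owns_data_ is false.
    auto view = Buffer::create(device, existing_address, size, page_size, BufferType::DRAM);
}
```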
```diff
@@ -263,7 +298,7 @@ void Buffer::deleter(Buffer* buffer) {
 }
 
 void Buffer::deallocate_impl() {
-    if (allocation_status_.load(std::memory_order::relaxed) == AllocationStatus::DEALLOCATED) {
+    if (allocation_status_.load(std::memory_order::relaxed) != AllocationStatus::ALLOCATED) {
         return;
     }
```

With `ALLOCATION_FAILED` added to the state enum, the early return now covers the failed-allocation and still-pending states as well as `DEALLOCATED`, rather than only the latter.
```diff
@@ -289,6 +324,10 @@ bool Buffer::is_allocated() const {
 }
 
 uint32_t Buffer::address() const {
+    if (allocation_status_.load(std::memory_order::acquire) != AllocationStatus::ALLOCATION_REQUESTED) {
+        return address_;
+    }
+
     if (device_->can_use_passthrough_scheduling()) {
         return address_;
     }
```

> **Review comment:** Optimization for the happy case: no waiting or other checks if the buffer is already allocated.
>
> **Reply:** Makes sense :)
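The diff shows only the added fast path; the rest of `address()` is not visible here. A rough reconstruction of the overall shape, assuming the mutex and condition-variable members are usable from this `const` method, would be: lock-free acquire load first, passthrough check second, condition-variable wait last.

```cpp
// Rough reconstruction, not the actual source: only the first `if` is
// taken verbatim from the diff.
uint32_t Buffer::address() const {
    // Fast path: once the status leaves ALLOCATION_REQUESTED, address_ has
    // been written; the acquire load pairs with the release store in create().
    if (allocation_status_.load(std::memory_order::acquire) != AllocationStatus::ALLOCATION_REQUESTED) {
        return address_;
    }

    // Passthrough scheduling executes pushed work inline on this thread,
    // so the allocation has already run by the time we get here.
    if (device_->can_use_passthrough_scheduling()) {
        return address_;
    }

    // Slow path: block until the worker thread publishes a terminal status.
    std::unique_lock lock(allocation_mutex_);
    allocation_cv_.wait(lock, [this] {
        return allocation_status_.load(std::memory_order::relaxed)
            != AllocationStatus::ALLOCATION_REQUESTED;
    });
    return address_;
}
```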
Second file in the diff (the `Buffer` class header):
```diff
@@ -144,6 +144,8 @@ struct BufferPageMapping {
 inline namespace v0 {
 
 class Buffer final {
+    struct Private { explicit Private() = default; };
+
 public:
     static std::shared_ptr<Buffer> create(
         Device *device,
```
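The `Private` member is the "passkey" idiom: the constructor stays public so `std::make_shared` can invoke it from library code, yet only code inside `Buffer` can manufacture the token it requires. A minimal standalone sketch of the idiom, using a hypothetical `Widget` class:

```cpp
#include <memory>

class Widget {
    // Only members of Widget can construct a Private token.
    struct Private { explicit Private() = default; };

public:
    // Public, so std::make_shared can call it from library code...
    Widget(int value, Private) : value_(value) {}

    // ...but only the factory can supply the Private token.
    static std::shared_ptr<Widget> create(int value) {
        return std::make_shared<Widget>(value, Private());
    }

private:
    int value_;
};

int main() {
    auto w = Widget::create(42);  // OK
    // std::make_shared<Widget>(1, ...);  // won't compile: Private is not nameable here
}
```

Here the idiom lets the new non-owning `create` overload use `std::make_shared` while still forcing all construction through the factory functions.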
```diff
@@ -153,6 +155,15 @@ class Buffer final {
         TensorMemoryLayout buffer_layout = TensorMemoryLayout::INTERLEAVED,
         const std::optional<ShardSpecBuffer>& shard_parameter = std::nullopt,
         std::optional<bool> bottom_up = std::nullopt);
+    static std::shared_ptr<Buffer> create(
+        Device *device,
+        DeviceAddr address,
+        DeviceAddr size,
+        DeviceAddr page_size,
+        BufferType buffer_type,
+        TensorMemoryLayout buffer_layout = TensorMemoryLayout::INTERLEAVED,
+        const std::optional<ShardSpecBuffer>& shard_parameter = std::nullopt,
+        std::optional<bool> bottom_up = std::nullopt);
 
     Buffer(const Buffer &other) = delete;
     Buffer &operator=(const Buffer &other) = delete;
```
```diff
@@ -210,18 +221,22 @@ class Buffer final {
 
     const std::shared_ptr<const BufferPageMapping>& get_buffer_page_mapping();
 
-private:
     Buffer(
         Device *device,
         DeviceAddr size,
         DeviceAddr page_size,
         BufferType buffer_type,
         TensorMemoryLayout buffer_layout,
         const std::optional<ShardSpecBuffer>& shard_parameter,
-        std::optional<bool> bottom_up);
+        std::optional<bool> bottom_up,
+        bool owns_data,
+        Private);
 
+private:
     enum class AllocationStatus : uint8_t {
         ALLOCATION_REQUESTED,
+        ALLOCATION_FAILED,
         ALLOCATED,
         DEALLOCATED,
     };
```

The `private:` specifier moves below the constructor: the constructor becomes public so `std::make_shared` (used by the new overload) can call it, while the `Private` passkey keeps it un-callable from outside the class.
```diff
@@ -239,6 +254,7 @@ class Buffer final {
     const BufferType buffer_type_;
     const TensorMemoryLayout buffer_layout_;
     const bool bottom_up_;
+    const bool owns_data_;
 
     std::atomic<AllocationStatus> allocation_status_ = AllocationStatus::ALLOCATION_REQUESTED;
     DeviceAddr address_ = 0;
```
> **Review comment:** Does this really need to be made `release`? It seems like we're already protected by a lock.
>
> **Reply:** Yes. In `Buffer::address()` the first check, `allocation_status_.load(std::memory_order::acquire) != AllocationStatus::ALLOCATION_REQUESTED`, is performed without a lock, for performance reasons. The only two places that acquire the lock are `Buffer::create` when allocating and `Buffer::address` when waiting on an allocation using the condition variable.
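As a standalone illustration of the pairing described above (not the tt-metal code): the release store in `create` must make the earlier plain write to `address_` visible to a thread whose acquire load skips the mutex. With two relaxed accesses there would be no happens-before edge, and the reader could observe the flag set but read a stale value.

```cpp
#include <atomic>
#include <cstdint>

std::atomic<bool> ready{false};
uint64_t payload = 0;  // stands in for address_

void producer() {
    payload = 0xABCD;                               // plain write first
    ready.store(true, std::memory_order::release);  // then publish the flag
}

bool try_consume(uint64_t& out) {
    // Acquire load without any lock: if it observes `true`, the write to
    // `payload` above is guaranteed to be visible as well.
    if (ready.load(std::memory_order::acquire)) {
        out = payload;
        return true;
    }
    return false;
}
```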