Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

#14366: Allow creating pre-allocated buffer with an address. Minor perf improvements #14394

Merged
merged 6 commits into from
Oct 29, 2024
Merged
Show file tree
Hide file tree
Changes from 5 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
28 changes: 26 additions & 2 deletions tt_metal/host_api.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -255,21 +255,45 @@ uint32_t CreateSemaphore(
*
* | Argument | Description | Type | Valid Range | Required |
* |-----------------|---------------------------------------- |--------------------------|-------------|----------|
* | config | config for buffer | InterleavedBufferConfig | | Yes |
* | config | Config for the buffer | InterleavedBufferConfig | | Yes |
*/
std::shared_ptr<Buffer> CreateBuffer(const InterleavedBufferConfig &config);

/**
* Creates a pre-allocated interleaved DRAM or L1 buffer on device
*
* Return value: std::shared_ptr<Buffer>
*
* | Argument | Description | Type | Valid Range | Required |
* |-----------------|---------------------------------------- |--------------------------|-------------|----------|
* | config | Config for the buffer | InterleavedBufferConfig | | Yes |
* | address | Device address of the buffer | DeviceAddr | | Yes |
*/
std::shared_ptr<Buffer> CreateBuffer(const InterleavedBufferConfig &config, DeviceAddr address);

/**
* Allocates a sharded DRAM or L1 buffer on device
*
* Return value: std::shared_ptr<Buffer>
*
* | Argument | Description | Type | Valid Range | Required |
* |-----------------|---------------------------------------- |--------------------------|-------------|----------|
* | config | config for buffer | ShardedBufferConfig | | Yes |
* | config | Config for the buffer | ShardedBufferConfig | | Yes |
*/
std::shared_ptr<Buffer> CreateBuffer(const ShardedBufferConfig &config);

/**
* Creates a pre-allocated sharded DRAM or L1 buffer on device
*
* Return value: std::shared_ptr<Buffer>
*
* | Argument | Description | Type | Valid Range | Required |
* |-----------------|---------------------------------------- |--------------------------|-------------|----------|
* | config | Config for the buffer | ShardedBufferConfig | | Yes |
* | address | Device address of the buffer | DeviceAddr | | Yes |
*/
std::shared_ptr<Buffer> CreateBuffer(const ShardedBufferConfig &config, DeviceAddr address);

/**
* Deallocates buffer from device by marking its memory as free.
*
Expand Down
49 changes: 44 additions & 5 deletions tt_metal/impl/buffers/buffer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -203,14 +203,17 @@ Buffer::Buffer(
const BufferType buffer_type,
const TensorMemoryLayout buffer_layout,
const std::optional<ShardSpecBuffer>& shard_parameters,
const std::optional<bool> bottom_up) :
const std::optional<bool> bottom_up,
const bool owns_data,
Private) :
device_(device),
size_(size),
page_size_(page_size),
buffer_type_(buffer_type),
buffer_layout_(buffer_layout),
shard_parameters_(shard_parameters),
bottom_up_(bottom_up.value_or(this->is_dram())),
owns_data_(owns_data),
buffer_page_mapping_(nullptr) {
TT_FATAL(this->device_ != nullptr && this->device_->allocator_ != nullptr, "Device and allocator need to not be null.");

Expand All @@ -227,7 +230,8 @@ std::shared_ptr<Buffer> Buffer::create(
const TensorMemoryLayout buffer_layout,
const std::optional<ShardSpecBuffer>& shard_parameters,
const std::optional<bool> bottom_up) {
auto* bufferPtr = new Buffer(device, size, page_size, buffer_type, buffer_layout, shard_parameters, bottom_up);
auto* bufferPtr = new Buffer(device, size, page_size, buffer_type, buffer_layout, shard_parameters, bottom_up, true /* owns data */, Private());
// Using a custom deleter to properly clean up the owned data
auto buffer = std::shared_ptr<Buffer>(bufferPtr, deleter);
buffer->weak_self = buffer;

Expand All @@ -237,19 +241,50 @@ std::shared_ptr<Buffer> Buffer::create(
}

buffer->device_->push_work([buffer] {
buffer->address_ = detail::AllocateBuffer(buffer.get());
try {
buffer->address_ = detail::AllocateBuffer(buffer.get());
} catch(...) {
std::unique_lock lock(buffer->allocation_mutex_);
buffer->allocation_status_.store(AllocationStatus::ALLOCATION_FAILED, std::memory_order::relaxed);
lock.unlock();
buffer->allocation_cv_.notify_all();

throw;
}

std::unique_lock lock(buffer->allocation_mutex_);
buffer->allocation_status_.store(AllocationStatus::ALLOCATED, std::memory_order::relaxed);
buffer->allocation_status_.store(AllocationStatus::ALLOCATED, std::memory_order::release);
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Does this really need to be made release? It seems like we're already protected by a lock.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes. In Buffer::address() the first check is allocation_status_.load(std::memory_order::acquire) != AllocationStatus::ALLOCATION_REQUESTED performed without a lock. This is done this way for performance optimization reasons. The only two places that capture a lock are Buffer::create when allocating and Buffer::address when waiting on an allocation using a condition variable.

lock.unlock();
buffer->allocation_cv_.notify_all();
});

return buffer;
}

std::shared_ptr<Buffer> Buffer::create(
    Device *device,
    DeviceAddr address,
    DeviceAddr size,
    DeviceAddr page_size,
    const BufferType buffer_type,
    const TensorMemoryLayout buffer_layout,
    const std::optional<ShardSpecBuffer>& shard_parameters,
    const std::optional<bool> bottom_up) {
    // This overload wraps a pre-allocated device region: the Buffer does not
    // own the underlying data, so the plain shared_ptr deleter is sufficient
    // (no custom deleter needed to release device memory).
    auto prealloc = std::make_shared<Buffer>(
        device, size, page_size, buffer_type, buffer_layout, shard_parameters, bottom_up, false /* owns data */, Private());
    prealloc->weak_self = prealloc;

    // The caller supplies the device address, so the buffer is usable
    // immediately — mark it ALLOCATED without going through the device
    // work queue. Relaxed ordering is sufficient here: the buffer has not
    // been shared with other threads yet.
    prealloc->address_ = address;
    prealloc->allocation_status_.store(AllocationStatus::ALLOCATED, std::memory_order::relaxed);

    return prealloc;
}

void Buffer::deallocate() {
deallocation_requested_.store(true, std::memory_order::relaxed);
if (!owns_data_) {
return;
}
device_->push_work([self = weak_self.lock()] {
self->deallocate_impl();
});
Expand All @@ -263,7 +298,7 @@ void Buffer::deleter(Buffer* buffer) {
}

void Buffer::deallocate_impl() {
if (allocation_status_.load(std::memory_order::relaxed) == AllocationStatus::DEALLOCATED) {
if (allocation_status_.load(std::memory_order::relaxed) != AllocationStatus::ALLOCATED) {
return;
}

Expand All @@ -289,6 +324,10 @@ bool Buffer::is_allocated() const {
}

uint32_t Buffer::address() const {
if (allocation_status_.load(std::memory_order::acquire) != AllocationStatus::ALLOCATION_REQUESTED) {
return address_;
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Optimization for the happy case, no waiting or other checks if buffer is already allocated.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Makes sense :)

}

if (device_->can_use_passthrough_scheduling()) {
return address_;
}
Expand Down
20 changes: 18 additions & 2 deletions tt_metal/impl/buffers/buffer.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -144,6 +144,8 @@ struct BufferPageMapping {
inline namespace v0 {

class Buffer final {
struct Private { explicit Private() = default; };
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The Private structure is used to limit access to the Buffer constructor, which can't be truly private, because of std::make_shared. This is the way recommended by CppReference documentation: https://en.cppreference.com/w/cpp/memory/enable_shared_from_this (see Best example)


public:
static std::shared_ptr<Buffer> create(
Device *device,
Expand All @@ -153,6 +155,15 @@ class Buffer final {
TensorMemoryLayout buffer_layout = TensorMemoryLayout::INTERLEAVED,
const std::optional<ShardSpecBuffer>& shard_parameter = std::nullopt,
std::optional<bool> bottom_up = std::nullopt);
static std::shared_ptr<Buffer> create(
Device *device,
DeviceAddr address,
DeviceAddr size,
DeviceAddr page_size,
BufferType buffer_type,
TensorMemoryLayout buffer_layout = TensorMemoryLayout::INTERLEAVED,
const std::optional<ShardSpecBuffer>& shard_parameter = std::nullopt,
std::optional<bool> bottom_up = std::nullopt);

Buffer(const Buffer &other) = delete;
Buffer &operator=(const Buffer &other) = delete;
Expand Down Expand Up @@ -210,18 +221,22 @@ class Buffer final {

const std::shared_ptr<const BufferPageMapping>& get_buffer_page_mapping();

private:

Buffer(
Device *device,
DeviceAddr size,
DeviceAddr page_size,
BufferType buffer_type,
TensorMemoryLayout buffer_layout,
const std::optional<ShardSpecBuffer>& shard_parameter,
std::optional<bool> bottom_up);
std::optional<bool> bottom_up,
bool owns_data,
Private);

private:
enum class AllocationStatus : uint8_t {
ALLOCATION_REQUESTED,
ALLOCATION_FAILED,
ALLOCATED,
DEALLOCATED,
};
Expand All @@ -239,6 +254,7 @@ class Buffer final {
const BufferType buffer_type_;
const TensorMemoryLayout buffer_layout_;
const bool bottom_up_;
const bool owns_data_;

std::atomic<AllocationStatus> allocation_status_ = AllocationStatus::ALLOCATION_REQUESTED;
DeviceAddr address_ = 0;
Expand Down
17 changes: 17 additions & 0 deletions tt_metal/tt_metal.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1090,6 +1090,11 @@ std::shared_ptr<Buffer> CreateBuffer(const InterleavedBufferConfig &config) {
config.device, config.size, config.page_size, config.buffer_type, config.buffer_layout, std::nullopt, std::nullopt);
}

// Creates a Buffer wrapping a pre-allocated interleaved region at `address`,
// described by `config`. Interleaved buffers carry no shard parameters.
std::shared_ptr<Buffer> CreateBuffer(const InterleavedBufferConfig &config, DeviceAddr address) {
    return Buffer::create(
        config.device,
        address,
        config.size,
        config.page_size,
        config.buffer_type,
        config.buffer_layout,
        std::nullopt,
        std::nullopt);
}

std::shared_ptr<Buffer> CreateBuffer(const ShardedBufferConfig &config) {
return Buffer::create(
config.device,
Expand All @@ -1101,6 +1106,18 @@ std::shared_ptr<Buffer> CreateBuffer(const ShardedBufferConfig &config) {
std::nullopt);
}

// Creates a Buffer wrapping a pre-allocated sharded region at `address`,
// forwarding the shard layout from the config.
std::shared_ptr<Buffer> CreateBuffer(const ShardedBufferConfig &config, DeviceAddr address) {
    return Buffer::create(
        config.device, address, config.size, config.page_size, config.buffer_type,
        config.buffer_layout, config.shard_parameters, std::nullopt);
}

// Marks the buffer's device memory as free by delegating to Buffer::deallocate.
void DeallocateBuffer(Buffer &buffer) {
    buffer.deallocate();
}

void AssignGlobalBufferToProgram(
Expand Down
Loading