diff --git a/include/triton/backend/backend_output_responder.h b/include/triton/backend/backend_output_responder.h index f1c7986..a507b8f 100644 --- a/include/triton/backend/backend_output_responder.h +++ b/include/triton/backend/backend_output_responder.h @@ -52,13 +52,31 @@ class BackendOutputResponder { // The caller can optionally provide 'event' for internal synchronization // instead of using 'stream'. explicit BackendOutputResponder( - TRITONBACKEND_Request** requests, const uint32_t request_count, + TRITONBACKEND_Request** requests, std::vector<TRITONBACKEND_Response*>* responses, TRITONBACKEND_MemoryManager* memory_manager, const bool first_dim_batching, const bool pinned_enabled, cudaStream_t stream, cudaEvent_t event = nullptr, bool copy_on_stream = false) - : need_sync_(false), requests_(requests), request_count_(request_count), + : need_sync_(false), requests_(requests), + responses_(responses), memory_manager_(memory_manager), + first_dim_batching_(first_dim_batching), + pinned_enabled_(pinned_enabled), + use_async_cpu_copy_(triton::common::AsyncWorkQueue::WorkerCount() > 1), + stream_(stream), event_(event), pending_pinned_byte_size_(0), + copy_on_stream_(copy_on_stream) + { + } + + // Legacy constructor for backwards compatibility with request_count parameter + explicit BackendOutputResponder( + TRITONBACKEND_Request** requests, const uint32_t /* request_count */, + std::vector<TRITONBACKEND_Response*>* responses, + TRITONBACKEND_MemoryManager* memory_manager, + const bool first_dim_batching, const bool pinned_enabled, + cudaStream_t stream, cudaEvent_t event = nullptr, + bool copy_on_stream = false) + : need_sync_(false), requests_(requests), responses_(responses), memory_manager_(memory_manager), first_dim_batching_(first_dim_batching), pinned_enabled_(pinned_enabled), @@ -75,12 +93,12 @@ class BackendOutputResponder { // max_batch_size value instead of having it provided directly as in // the above constructor. 
explicit BackendOutputResponder( - TRITONBACKEND_Request** requests, const uint32_t request_count, + TRITONBACKEND_Request** requests, const uint32_t /* request_count */, std::vector<TRITONBACKEND_Response*>* responses, const int max_batch_size, TRITONBACKEND_MemoryManager* memory_manager, const bool pinned_enabled, cudaStream_t stream, cudaEvent_t event = nullptr, bool copy_on_stream = false) - : need_sync_(false), requests_(requests), request_count_(request_count), + : need_sync_(false), requests_(requests), responses_(responses), memory_manager_(memory_manager), first_dim_batching_(max_batch_size >= 1), pinned_enabled_(pinned_enabled), @@ -152,7 +170,6 @@ class BackendOutputResponder { bool need_sync_; TRITONBACKEND_Request** requests_; - const uint32_t request_count_; std::vector<TRITONBACKEND_Response*>* responses_; TRITONBACKEND_MemoryManager* memory_manager_; const bool first_dim_batching_;