 
 #include <future>
 #include "pb_utils.h"
+#include "scoped_defer.h"
 #include "triton/backend/backend_common.h"
 #include "triton/core/tritonserver.h"
 
@@ -72,7 +73,10 @@ ResponseAlloc(
     void** buffer_userp, TRITONSERVER_MemoryType* actual_memory_type,
     int64_t* actual_memory_type_id)
 {
-  SharedMemoryManager* shm_pool = reinterpret_cast<SharedMemoryManager*>(userp);
+  std::unique_ptr<SharedMemoryManager> shm_pool(
+      reinterpret_cast<SharedMemoryManager*>(userp));
+
+  ScopedDefer _([&shm_pool] { shm_pool.release(); });
   *actual_memory_type = preferred_memory_type;
   *actual_memory_type_id = preferred_memory_type_id;
 
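The unique_ptr wrap followed by an unconditional release() at scope exit is a borrow, not an ownership transfer: ResponseAlloc needs a std::unique_ptr<SharedMemoryManager> to pass around, but the pool is owned elsewhere. A minimal self-contained sketch of the same pattern, with a hypothetical Defer helper standing in for the project's ScopedDefer (whose exact interface is not shown in this diff):

#include <functional>
#include <memory>

struct Pool {};  // stand-in for SharedMemoryManager

// Hypothetical analogue of ScopedDefer: runs a callback when the scope exits.
class Defer {
 public:
  explicit Defer(std::function<void()> fn) : fn_(std::move(fn)) {}
  ~Defer() { fn_(); }

 private:
  std::function<void()> fn_;
};

void UsePool(void* userp)
{
  // Borrow the raw pointer through a unique_ptr so that APIs expecting a
  // std::unique_ptr<Pool>& can be used inside this function.
  std::unique_ptr<Pool> pool(reinterpret_cast<Pool*>(userp));

  // Destroyed before `pool` (reverse construction order), so the pointer is
  // given back before the unique_ptr destructor could delete it.
  Defer release_guard([&pool] { pool.release(); });

  // ... use `pool` here ...
}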
@@ -90,30 +94,20 @@ ResponseAlloc(
     case TRITONSERVER_MEMORY_CPU_PINNED: {
       *actual_memory_type = TRITONSERVER_MEMORY_CPU;
       *actual_memory_type_id = 0;
-      bi::managed_external_buffer::handle_t tensor_handle;
       try {
-        AllocatedSharedMemory<char> memory =
-            shm_pool->Construct<char>(byte_size);
-        *buffer = memory.data_.get();
-        tensor_handle = memory.handle_;
-
-        // Release the ownership to avoid deallocation. The buffer
-        // will be deallocated in ResponseRelease function.
-        memory.data_.release();
+        std::unique_ptr<PbMemory> pb_memory = PbMemory::Create(
+            shm_pool, *actual_memory_type, *actual_memory_type_id, byte_size,
+            nullptr /* data */, false /* copy_gpu */);
+        *buffer = pb_memory->DataPtr();
+        *buffer_userp = reinterpret_cast<void*>(pb_memory.get());
+        pb_memory.release();
       }
       catch (const PythonBackendException& pb_exception) {
         TRITONSERVER_Error* err =
             CreateTritonErrorFromException(pb_exception);
         return err;
       }
-      // Store the buffer offset in the userp; The userp is large enough to
-      // hold the shared memory offset and the address of the Shared memory
-      // manager
-      AllocationInfo* allocation_info = new AllocationInfo;
-      *buffer_userp = allocation_info;
-
-      allocation_info->handle_ = tensor_handle;
-      allocation_info->shm_manager_ = shm_pool;
+
     } break;
 #ifdef TRITON_ENABLE_GPU
     case TRITONSERVER_MEMORY_GPU: {
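The release() after parking the raw pointer in buffer_userp is what keeps the PbMemory alive beyond the end of ResponseAlloc; the matching re-adoption happens in the last hunk, where the output tensor takes ownership. A reduced allocator-side sketch, with a hypothetical Block type standing in for PbMemory:

#include <cstddef>
#include <memory>

// Hypothetical stand-in for PbMemory: owns a plain heap buffer.
struct Block {
  explicit Block(std::size_t n) : data(new char[n]) {}
  ~Block() { delete[] data; }
  char* data;
};

// Allocator callback: the Block must outlive this function, so its raw
// pointer is parked in userp and ownership is deliberately leaked out of
// the unique_ptr instead of being freed at end of scope.
void Alloc(std::size_t byte_size, void** buffer, void** userp)
{
  std::unique_ptr<Block> block = std::make_unique<Block>(byte_size);
  *buffer = block->data;  // the response payload is written here
  *userp = block.get();   // retrieved again when the output tensor is built
  block.release();
}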
@@ -151,28 +145,12 @@ ResponseRelease(
     size_t byte_size, TRITONSERVER_MemoryType memory_type,
     int64_t memory_type_id)
 {
-  switch (memory_type) {
-    case TRITONSERVER_MEMORY_CPU:
-    case TRITONSERVER_MEMORY_CPU_PINNED: {
-      AllocationInfo* allocation_info =
-          reinterpret_cast<AllocationInfo*>(buffer_userp);
-      {
-        // Load the data so that it is deallocated automatically.
-        auto result = allocation_info->shm_manager_->Load<char>(
-            allocation_info->handle_, true /* unsafe */);
-      }
-
-      delete allocation_info;
-    } break;
-    case TRITONSERVER_MEMORY_GPU: {
-      // No action is required for the GPU tensors.
-    } break;
-  }
-
   return nullptr;  // Success
 }
 
-RequestExecutor::RequestExecutor(TRITONSERVER_Server* server) : server_(server)
+RequestExecutor::RequestExecutor(
+    std::unique_ptr<SharedMemoryManager>& shm_pool, TRITONSERVER_Server* server)
+    : server_(server), shm_pool_(shm_pool)
 {
   TRITONSERVER_ResponseAllocator* allocator;
   THROW_IF_TRITON_ERROR(TRITONSERVER_ResponseAllocatorNew(
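With the shm_pool parameter moved into the constructor, the executor keeps a handle to a pool it does not own; assuming the (not shown) header declares shm_pool_ as a reference to the unique_ptr, the caller retains ownership and must outlive the executor. A small sketch of that lifetime relationship, using placeholder SharedPool/Executor names:

#include <memory>

struct SharedPool {};  // stand-in for SharedMemoryManager

class Executor {
 public:
  explicit Executor(std::unique_ptr<SharedPool>& pool) : pool_(pool) {}

  // pool_.get() would be handed to response callbacks as userp.
  void Run() {}

 private:
  std::unique_ptr<SharedPool>& pool_;  // borrowed; owner must outlive Executor
};

int main()
{
  auto pool = std::make_unique<SharedPool>();
  Executor executor(pool);  // pool stays owned by the caller
  executor.Run();
}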
@@ -183,7 +161,6 @@ RequestExecutor::RequestExecutor(TRITONSERVER_Server* server) : server_(server)
 std::unique_ptr<InferResponse>
 RequestExecutor::Infer(
     const std::shared_ptr<InferRequest>& infer_request,
-    const std::unique_ptr<SharedMemoryManager>& shm_pool,
     TRITONSERVER_InferenceResponse** triton_response)
 {
   std::unique_ptr<InferResponse> infer_response;
@@ -247,7 +224,7 @@ RequestExecutor::Infer(
   std::future<TRITONSERVER_InferenceResponse*> completed = p->get_future();
 
   THROW_IF_TRITON_ERROR(TRITONSERVER_InferenceRequestSetResponseCallback(
-      irequest, response_allocator_, shm_pool.get(), InferResponseComplete,
+      irequest, response_allocator_, shm_pool_.get(), InferResponseComplete,
       reinterpret_cast<void*>(p)));
 
   THROW_IF_TRITON_ERROR(TRITONSERVER_ServerInferAsync(
@@ -284,16 +261,21 @@ RequestExecutor::Infer(
       // userp is only set for the CPU tensors
       if (memory_type != TRITONSERVER_MEMORY_GPU) {
         if (byte_size != 0) {
-          output_tensors.push_back(std::make_shared<PbTensor>(
+          std::shared_ptr<PbTensor> pb_tensor = std::make_shared<PbTensor>(
               sname, dims_vector, datatype, memory_type, memory_type_id,
               const_cast<void*>(base), byte_size,
-              nullptr /* DLManagedTensor */,
-              *(reinterpret_cast<off_t*>(userp))));
+              nullptr /* DLManagedTensor */);
+
+          // Load the data so that it is deallocated automatically.
+          std::unique_ptr<PbMemory> pb_memory(
+              reinterpret_cast<PbMemory*>(userp));
+          pb_tensor->SetMemory(std::move(pb_memory));
+          output_tensors.push_back(pb_tensor);
         } else {
           output_tensors.push_back(std::make_shared<PbTensor>(
               sname, dims_vector, datatype, memory_type, memory_type_id,
               const_cast<void*>(base), byte_size,
-              nullptr /* DLManagedTensor */, 0 /* shared memory offest */));
+              nullptr /* DLManagedTensor */));
         }
       } else {
         output_tensors.push_back(std::make_shared<PbTensor>(
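Re-adopting the raw PbMemory pointer from userp and moving it into the tensor ties the shared-memory buffer's lifetime to the tensor's, which is why the ResponseRelease body above could be emptied out. A reduced sketch of the adopt-and-store step, with hypothetical Tensor/Memory types standing in for PbTensor/PbMemory:

#include <memory>
#include <utility>

struct Memory {};  // stand-in for PbMemory (a shared-memory backed buffer)

class Tensor {
 public:
  // Take ownership of the backing memory; it is freed when the tensor dies.
  void SetMemory(std::unique_ptr<Memory> memory) { memory_ = std::move(memory); }

 private:
  std::unique_ptr<Memory> memory_;
};

void AttachOutput(void* userp, const std::shared_ptr<Tensor>& tensor)
{
  // userp carries the raw pointer released by the allocator; re-adopt it
  // into a unique_ptr and move it into the tensor.
  std::unique_ptr<Memory> memory(reinterpret_cast<Memory*>(userp));
  tensor->SetMemory(std::move(memory));
}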