@@ -201,64 +201,50 @@ InferResponse::IsLastResponse()
201201}
202202
203203#ifndef  TRITON_PB_STUB
204- std::shared_ptr<TRITONSERVER_Error*> 
204+ void 
205205InferResponse::Send (
206-     TRITONBACKEND_ResponseFactory* response_factory , void * cuda_stream,
206+     TRITONBACKEND_Response* response , void * cuda_stream,
207207    bool & requires_deferred_callback, const  uint32_t  flags,
208208    std::unique_ptr<SharedMemoryManager>& shm_pool,
209+     GPUBuffersHelper& gpu_buffer_helper,
209210    std::vector<std::pair<std::unique_ptr<PbMemory>, void *>>& output_buffers,
210-     const  std::set<std::string>& requested_output_names,
211-     TRITONBACKEND_Response* response)
211+     const  std::set<std::string>& requested_output_names)
212212{
213213  std::shared_ptr<TRITONSERVER_Error*> response_error =
214214      WrapTritonErrorInSharedPtr (nullptr );
215215  std::unique_ptr<ScopedDefer> response_error_handling;
216216  requires_deferred_callback = false ;
217217
218-   //  Should only destruct the response factory whenever a response factory is
219-   //  being created.
220-   bool  destruct_response_factor = (response == nullptr );
221- 
222-   if  (response == nullptr ) {
223-     SET_ERROR_AND_RETURN (
224-         response_error,
225-         TRITONBACKEND_ResponseNewFromFactory (&response, response_factory));
226-   }
227- 
228218  //  This lambda expression will be called when this function exits, if the
229219  //  inference response doesn't have any GPU tensors. Otherwise, it will be
230220  //  called when the object is destructed or DeferredSendCallback is called.
231-   response_error_handling = std::make_unique<ScopedDefer>(
232-       [response, response_error, flags, response_factory,
233-        destruct_response_factor] {
221+   response_error_handling =
222+       std::make_unique<ScopedDefer>([response, response_error, flags] {
234223        if  (response != nullptr ) {
235224          LOG_IF_ERROR (
236225              TRITONBACKEND_ResponseSend (response, flags, *response_error),
237226              " failed to send the response."  );
238-           if  (flags == TRITONSERVER_RESPONSE_COMPLETE_FINAL &&
239-               destruct_response_factor) {
240-             std::unique_ptr<
241-                 TRITONBACKEND_ResponseFactory, backend::ResponseFactoryDeleter>
242-             response_factory_ptr (
243-                 reinterpret_cast <TRITONBACKEND_ResponseFactory*>(
244-                     response_factory));
245-           }
246227        }
247228      });
248229
249230  //  Moves the response sending callback so that it is not called until the stub
250231  //  process fills in the GPU buffers.
251-   ScopedDefer deferred_task (
252-       [this , &requires_deferred_callback, &response_error_handling] {
253-         if  (requires_deferred_callback) {
254-           deferred_send_callback_ = std::move (response_error_handling);
255-         }
256-       });
232+   ScopedDefer deferred_task ([this , &requires_deferred_callback,
233+                              &response_error_handling, &gpu_buffer_helper,
234+                              response_error, &shm_pool] {
235+     if  (*response_error != nullptr ) {
236+       gpu_buffer_helper.SetError (
237+           shm_pool, TRITONSERVER_ErrorMessage (*response_error));
238+     }
239+     if  (requires_deferred_callback) {
240+       deferred_send_callback_ = std::move (response_error_handling);
241+     }
242+   });
257243
258244  if  (HasError ()) {
259245    *response_error = TRITONSERVER_ErrorNew (
260246        TRITONSERVER_ERROR_INTERNAL, Error ()->Message ().c_str ());
261-     return   nullptr ;
247+     return ;
262248  }
263249
264250  bool  cuda_copy = false ;
@@ -322,6 +308,7 @@ InferResponse::Send(
322308                output_tensor->ByteSize (), reinterpret_cast <char *>(buffer),
323309                true  /*  copy_gpu */  ));
324310      }
311+       gpu_buffer_helper.AddBuffer (output_buffer->ShmHandle ());
325312      output_buffers.push_back ({std::move (output_buffer), buffer});
326313#endif 
327314    }
@@ -336,6 +323,7 @@ InferResponse::Send(
336323              shm_pool, actual_memory_type, actual_memory_type_id,
337324              output_tensor->ByteSize (), nullptr  /*  data ptr */  ));
338325
326+       gpu_buffer_helper.AddBuffer (output_buffer->ShmHandle ());
339327      output_buffers.push_back ({std::move (output_buffer), buffer});
340328    }
341329
@@ -357,8 +345,6 @@ InferResponse::Send(
357345    cudaStreamSynchronize (reinterpret_cast <cudaStream_t>(cuda_stream));
358346  }
359347#endif   //  TRITON_ENABLE_GPU
360- 
361-   return  response_error;
362348}
363349#endif 
364350
0 commit comments