 #include "triton/backend/backend_memory.h"
 #include "triton/backend/backend_model.h"
 #include "triton/backend/backend_model_instance.h"
+#include "triton/backend/backend_output_responder.h"
 #include "triton/common/triton_json.h"
 #include "triton/core/tritonbackend.h"
 #include "triton/core/tritonserver.h"
@@ -369,7 +370,7 @@ ModelInstanceState::ProcessRequests(
       requests_shm_offset));
   request_batch->requests = requests_shm_offset;
 
-  // We take the responsiblity of the responses.
+  // We take the responsibility of the responses.
   std::vector<TRITONBACKEND_Response*> responses;
   responses.reserve(request_count);
 
@@ -545,6 +546,7 @@ ModelInstanceState::ProcessRequests(
           (char**)&responses_shm, sizeof(Response) * response_batch->batch_size,
           response_batch->responses));
 
+
   for (uint32_t r = 0; r < request_count; ++r) {
     TRITONBACKEND_Response* response = responses[r];
     TRITONBACKEND_Request* request = requests[r];
@@ -606,14 +608,20 @@ ModelInstanceState::ProcessRequests(
             (char**)&output_tensors, sizeof(Tensor) * requested_output_count,
             response_shm->outputs));
 
+    bool cuda_copy = false;
+    std::set<std::string> requested_output_names;
     for (size_t j = 0; j < requested_output_count; ++j) {
-      Tensor* output_tensor = &output_tensors[j];
+      const char* output_name;
+      GUARDED_RESPOND_IF_ERROR(
+          responses, r,
+          TRITONBACKEND_RequestOutputName(request, j, &output_name));
+      requested_output_names.insert(output_name);
+    }
 
-      TRITONBACKEND_Output* triton_output;
+    for (size_t j = 0; j < requested_output_count; ++j) {
+      Tensor* output_tensor = &output_tensors[j];
       TRITONSERVER_DataType triton_dt = output_tensor->dtype;
-
       size_t dims_count = output_tensor->dims_count;
-
       int64_t* dims;
       GUARDED_RESPOND_IF_EXCEPTION(
           responses, r,
@@ -626,6 +634,12 @@ ModelInstanceState::ProcessRequests(
           responses, r,
           LoadStringFromSharedMemory(shm_pool_, output_tensor->name, name));
 
+      // Skip the output tensor if it is not in the list of requested outputs
+      if (requested_output_names.find(std::string(name)) ==
+          requested_output_names.end()) {
+        continue;
+      }
+
       RawData* raw_data;
       GUARDED_RESPOND_IF_EXCEPTION(
           responses, r,
@@ -638,50 +652,35 @@ ModelInstanceState::ProcessRequests(
           shm_pool_->MapOffset(
               (char**)&data, raw_data->byte_size, raw_data->memory_ptr));
 
-      // Prepare output buffers.
+      std::vector<int64_t> batch_shape(dims, dims + dims_count);
+      TRITONSERVER_MemoryType actual_memory_type = TRITONSERVER_MEMORY_CPU;
+      int64_t actual_memory_type_id = 0;
+      void* buffer;
+
+      TRITONBACKEND_Output* response_output;
       GUARDED_RESPOND_IF_ERROR(
           responses, r,
           TRITONBACKEND_ResponseOutput(
-              response, &triton_output, name, triton_dt, dims, dims_count));
-
-      uint64_t output_byte_size = raw_data->byte_size;
+              response, &response_output, name, triton_dt, batch_shape.data(),
+              batch_shape.size()));
 
-      void* output_buffer;
-
-      TRITONSERVER_MemoryType output_memory_type = TRITONSERVER_MEMORY_CPU;
-      int64_t output_memory_type_id = 0;
+      bool cuda_used;
       GUARDED_RESPOND_IF_ERROR(
           responses, r,
           TRITONBACKEND_OutputBuffer(
-              triton_output, &output_buffer, output_byte_size,
-              &output_memory_type, &output_memory_type_id));
-
-      if ((responses[r] == nullptr) ||
-          (output_memory_type == TRITONSERVER_MEMORY_GPU)) {
-        GUARDED_RESPOND_IF_ERROR(
-            responses, r,
-            TRITONSERVER_ErrorNew(
-                TRITONSERVER_ERROR_UNSUPPORTED,
-                "can't create response in GPU memory."));
-        TRITONSERVER_LogMessage(
-            TRITONSERVER_LOG_ERROR, __FILE__, __LINE__,
-            (std::string("request ") + std::to_string(r) +
-             ": failed to create output buffer in CPU memory.")
-                .c_str());
-        continue;
-      }
-
-      // Copy Python output to Triton output buffers
-      std::copy(data, data + output_byte_size, (char*)output_buffer);
+              response_output, &buffer, raw_data->byte_size,
+              &actual_memory_type, &actual_memory_type_id));
+      CopyBuffer(
+          "Failed to copy string", TRITONSERVER_MEMORY_CPU /* memory_type */,
+          0 /* memory_type_id */, actual_memory_type, actual_memory_type_id,
+          raw_data->byte_size, data, buffer, CudaStream(), &cuda_used);
+      cuda_copy |= cuda_used;
     }
-
-    if (responses[r] == nullptr) {
-      LOG_MESSAGE(
-          TRITONSERVER_LOG_ERROR, (std::string("Request ") + std::to_string(r) +
-                                   ": failed to create output response")
-                                      .c_str());
-      continue;
+#ifdef TRITON_ENABLE_GPU
+    if (cuda_copy) {
+      cudaStreamSynchronize(stream_);
     }
+#endif  // TRITON_ENABLE_GPU
 
     // If error happens at this stage, we can only log it
     LOG_IF_ERROR(
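
The hunks above collect the output names requested by the client and skip any tensor produced by the Python model that was not requested, instead of unconditionally copying every produced tensor into the response. A minimal sketch of that pattern in isolation, using only the public `TRITONBACKEND_*` request API; the helper name and error handling here are illustrative and not part of this commit:

```cpp
#include <set>
#include <string>

#include "triton/core/tritonbackend.h"
#include "triton/core/tritonserver.h"

// Collect the output names the client asked for; a produced tensor whose name
// is not in this set can be skipped instead of being copied into the response.
TRITONSERVER_Error*
CollectRequestedOutputNames(
    TRITONBACKEND_Request* request, std::set<std::string>* names)
{
  uint32_t count = 0;
  TRITONSERVER_Error* err = TRITONBACKEND_RequestOutputCount(request, &count);
  if (err != nullptr) {
    return err;
  }
  for (uint32_t i = 0; i < count; ++i) {
    const char* name = nullptr;
    err = TRITONBACKEND_RequestOutputName(request, i, &name);
    if (err != nullptr) {
      return err;
    }
    names->insert(name);
  }
  return nullptr;  // success
}
```

In the diff itself the same check is done inline: the name loaded from shared memory is looked up with `requested_output_names.find(...) == requested_output_names.end()` before the tensor's raw data is mapped and copied.
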
@@ -949,7 +948,7 @@ ModelInstanceState::GetInputTensor(
     return TRITONSERVER_ErrorNew(
         TRITONSERVER_ERROR_UNSUPPORTED,
         "Python backend does not support input size larger than 2GBs, consider "
-        "parititioning your input into multiple inputs.");
+        "partitioning your input into multiple inputs.");
   }
 
   // We need to create a new collector for every request because python backend
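
The new copy path replaces the plain `std::copy` with the `CopyBuffer` utility from `backend_common.h`, which only touches the CUDA stream when one side of the copy lives in GPU memory and reports that through `cuda_used`; the caller ORs the flags together and synchronizes the stream once after the output loop. A rough, self-contained sketch of that pattern, assuming a GPU-enabled build (`TRITON_ENABLE_GPU` defined) and the `CopyBuffer(msg, src type/id, dst type/id, byte_size, src, dst, stream, cuda_used)` parameter order used above; the function, container, and buffer names here are illustrative:

```cpp
#include <cstddef>
#include <utility>
#include <vector>

#include <cuda_runtime_api.h>

#include "triton/backend/backend_common.h"

// Copy several host buffers into response buffers whose placement (CPU or GPU)
// was decided by Triton, then synchronize the stream once if any copy was
// asynchronous.
void
CopyOutputsThenSync(
    const std::vector<std::pair<const void*, void*>>& src_dst,
    const std::vector<size_t>& byte_sizes,
    const std::vector<TRITONSERVER_MemoryType>& dst_types, cudaStream_t stream)
{
  bool cuda_copy = false;
  for (size_t i = 0; i < src_dst.size(); ++i) {
    bool cuda_used = false;
    LOG_IF_ERROR(
        triton::backend::CopyBuffer(
            "output copy", TRITONSERVER_MEMORY_CPU /* src */, 0, dst_types[i],
            0, byte_sizes[i], src_dst[i].first, src_dst[i].second, stream,
            &cuda_used),
        "failed to copy output buffer");
    cuda_copy |= cuda_used;  // remember whether any copy used the stream
  }
  if (cuda_copy) {
    cudaStreamSynchronize(stream);  // one sync covers all pending async copies
  }
}
```

Deferring the synchronization this way keeps the per-output loop free of stream stalls and pays the synchronization cost at most once per response, which is the same trade-off the diff makes with `cuda_copy` and the `#ifdef TRITON_ENABLE_GPU` block.
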