
Commit b2023cb

Make sure that the output buffers are in CPU (triton-inference-server#47)
* Add checks for flag values
* Use output responder for collecting output tensors
* Review edits and fix for TYPE_STRING
* Review edits
1 parent 18161f9 commit b2023cb
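
The core of this commit (in src/python.cc below) is that output tensors are collected only for the outputs the client actually requested, and each one is copied into a response buffer that may end up in CPU or GPU memory. A minimal standalone sketch of that name-filtering step follows; the types and names here (FakeTensor, OUTPUT0, DEBUG) are hypothetical stand-ins, not the backend's actual data structures.

// Sketch only: filter produced outputs against the set of requested output names.
#include <iostream>
#include <set>
#include <string>
#include <vector>

struct FakeTensor {  // hypothetical stand-in for the backend's shared-memory Tensor
  std::string name;
};

int main()
{
  // Outputs the Python model produced.
  std::vector<FakeTensor> produced = {{"OUTPUT0"}, {"OUTPUT1"}, {"DEBUG"}};

  // Names the client actually requested (in the backend these come from
  // TRITONBACKEND_RequestOutputName, one call per requested output).
  std::set<std::string> requested_output_names = {"OUTPUT0", "OUTPUT1"};

  for (const FakeTensor& tensor : produced) {
    // Skip the output tensor if it is not in the list of requested outputs.
    if (requested_output_names.find(tensor.name) == requested_output_names.end()) {
      continue;
    }
    std::cout << "copying " << tensor.name << " into a response buffer\n";
  }
  return 0;
}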

2 files changed: +46 -47 lines changed


src/pb_stub.cc

Lines changed: 6 additions & 6 deletions
@@ -62,12 +62,6 @@ namespace triton { namespace backend { namespace python {
 // Macros that use current filename and line number.
 #define LOG_INFO LOG_INFO_FL(__FILE__, __LINE__)
 
-void
-SignalHandler(int signum)
-{
-  // Skip the SIGINT
-}
-
 class Logger {
  public:
   // Log a message.
@@ -110,6 +104,12 @@ class LogMessage {
 
 #define LOG_INFO_FL(FN, LN) LogMessage((char*)(FN), LN).stream()
 
+void
+SignalHandler(int signum)
+{
+  // Skip the SIGINT
+}
+
 class Stub {
   pthread_mutex_t* child_mutex_;
   pthread_cond_t* child_cond_;
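
The pb_stub.cc hunks only move the SignalHandler definition below the logging macros; the handler itself is a no-op that swallows SIGINT. For context, a minimal sketch of how such a handler is typically installed is shown below; the std::signal registration call site is an assumption for illustration and is not part of this diff.

// Sketch only: register a no-op SIGINT handler so the process ignores Ctrl-C.
#include <chrono>
#include <csignal>
#include <cstdio>
#include <thread>

void
SignalHandler(int signum)
{
  (void)signum;
  // Skip the SIGINT
}

int main()
{
  std::signal(SIGINT, SignalHandler);  // assumed registration; not shown in the diff
  std::printf("Press Ctrl-C; the process keeps running for ~5 seconds.\n");
  std::this_thread::sleep_for(std::chrono::seconds(5));
  return 0;
}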

src/python.cc

Lines changed: 40 additions & 41 deletions
@@ -51,6 +51,7 @@
 #include "triton/backend/backend_memory.h"
 #include "triton/backend/backend_model.h"
 #include "triton/backend/backend_model_instance.h"
+#include "triton/backend/backend_output_responder.h"
 #include "triton/common/triton_json.h"
 #include "triton/core/tritonbackend.h"
 #include "triton/core/tritonserver.h"
@@ -369,7 +370,7 @@ ModelInstanceState::ProcessRequests(
       requests_shm_offset));
   request_batch->requests = requests_shm_offset;
 
-  // We take the responsiblity of the responses.
+  // We take the responsibilty of the responses.
   std::vector<TRITONBACKEND_Response*> responses;
   responses.reserve(request_count);
 
@@ -545,6 +546,7 @@ ModelInstanceState::ProcessRequests(
       (char**)&responses_shm, sizeof(Response) * response_batch->batch_size,
       response_batch->responses));
 
+
   for (uint32_t r = 0; r < request_count; ++r) {
     TRITONBACKEND_Response* response = responses[r];
     TRITONBACKEND_Request* request = requests[r];
@@ -606,14 +608,20 @@ ModelInstanceState::ProcessRequests(
         (char**)&output_tensors, sizeof(Tensor) * requested_output_count,
         response_shm->outputs));
 
+    bool cuda_copy = false;
+    std::set<std::string> requested_output_names;
     for (size_t j = 0; j < requested_output_count; ++j) {
-      Tensor* output_tensor = &output_tensors[j];
+      const char* output_name;
+      GUARDED_RESPOND_IF_ERROR(
+          responses, r,
+          TRITONBACKEND_RequestOutputName(request, j, &output_name));
+      requested_output_names.insert(output_name);
+    }
 
-      TRITONBACKEND_Output* triton_output;
+    for (size_t j = 0; j < requested_output_count; ++j) {
+      Tensor* output_tensor = &output_tensors[j];
       TRITONSERVER_DataType triton_dt = output_tensor->dtype;
-
       size_t dims_count = output_tensor->dims_count;
-
       int64_t* dims;
       GUARDED_RESPOND_IF_EXCEPTION(
           responses, r,
@@ -626,6 +634,12 @@ ModelInstanceState::ProcessRequests(
           responses, r,
           LoadStringFromSharedMemory(shm_pool_, output_tensor->name, name));
 
+      // Skip the output tensor if it is not in the list of requested outputs
+      if (requested_output_names.find(std::string(name)) ==
+          requested_output_names.end()) {
+        continue;
+      }
+
       RawData* raw_data;
       GUARDED_RESPOND_IF_EXCEPTION(
           responses, r,
@@ -638,50 +652,35 @@ ModelInstanceState::ProcessRequests(
           shm_pool_->MapOffset(
               (char**)&data, raw_data->byte_size, raw_data->memory_ptr));
 
-      // Prepare output buffers.
+      std::vector<int64_t> batch_shape(dims, dims + dims_count);
+      TRITONSERVER_MemoryType actual_memory_type = TRITONSERVER_MEMORY_CPU;
+      int64_t actual_memory_type_id = 0;
+      void* buffer;
+
+      TRITONBACKEND_Output* response_output;
       GUARDED_RESPOND_IF_ERROR(
           responses, r,
           TRITONBACKEND_ResponseOutput(
-              response, &triton_output, name, triton_dt, dims, dims_count));
-
-      uint64_t output_byte_size = raw_data->byte_size;
+              response, &response_output, name, triton_dt, batch_shape.data(),
+              batch_shape.size()));
 
-      void* output_buffer;
-
-      TRITONSERVER_MemoryType output_memory_type = TRITONSERVER_MEMORY_CPU;
-      int64_t output_memory_type_id = 0;
+      bool cuda_used;
       GUARDED_RESPOND_IF_ERROR(
           responses, r,
           TRITONBACKEND_OutputBuffer(
-              triton_output, &output_buffer, output_byte_size,
-              &output_memory_type, &output_memory_type_id));
-
-      if ((responses[r] == nullptr) ||
-          (output_memory_type == TRITONSERVER_MEMORY_GPU)) {
-        GUARDED_RESPOND_IF_ERROR(
-            responses, r,
-            TRITONSERVER_ErrorNew(
-                TRITONSERVER_ERROR_UNSUPPORTED,
-                "can't create response in GPU memory."));
-        TRITONSERVER_LogMessage(
-            TRITONSERVER_LOG_ERROR, __FILE__, __LINE__,
-            (std::string("request ") + std::to_string(r) +
-             ": failed to create output buffer in CPU memory.")
-                .c_str());
-        continue;
-      }
-
-      // Copy Python output to Triton output buffers
-      std::copy(data, data + output_byte_size, (char*)output_buffer);
+              response_output, &buffer, raw_data->byte_size,
+              &actual_memory_type, &actual_memory_type_id));
+      CopyBuffer(
+          "Failed to copy string", TRITONSERVER_MEMORY_CPU /* memory_type */,
+          0 /* memory_type_id */, actual_memory_type, actual_memory_type_id,
+          raw_data->byte_size, data, buffer, CudaStream(), &cuda_used);
+      cuda_copy |= cuda_used;
     }
-
-    if (responses[r] == nullptr) {
-      LOG_MESSAGE(
-          TRITONSERVER_LOG_ERROR, (std::string("Request ") + std::to_string(r) +
-                                   ": failed to create output response")
-              .c_str());
-      continue;
+#ifdef TRITON_ENABLE_GPU
+    if (cuda_copy) {
+      cudaStreamSynchronize(stream_);
     }
+#endif  // TRITON_ENABLE_GPU
 
     // If error happens at this stage, we can only log it
     LOG_IF_ERROR(
@@ -949,7 +948,7 @@ ModelInstanceState::GetInputTensor(
     return TRITONSERVER_ErrorNew(
         TRITONSERVER_ERROR_UNSUPPORTED,
         "Python backend does not support input size larger than 2GBs, consider "
-        "parititioning your input into multiple inputs.");
+        "partitioning your input into multiple inputs.");
   }
 
   // We need to create a new collector for every request because python backend
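
The main python.cc hunk above routes every kept output through TRITONBACKEND_OutputBuffer and a CopyBuffer call that tolerates the buffer landing in GPU memory, accumulating cuda_copy and synchronizing the CUDA stream once at the end if any copy actually went to the device. The sketch below models that deferred-synchronize pattern only; CopyToOutputBuffer and the Memory enum are made-up names, not the backend's CopyBuffer utility, and the device branch is only stubbed out.

// Sketch only: copy into a destination that may be CPU or GPU memory and
// defer a single synchronize until all outputs have been handled.
#include <cstdint>
#include <cstdio>
#include <cstring>
#include <vector>

enum class Memory { CPU, GPU };

void
CopyToOutputBuffer(
    const void* src, void* dst, size_t byte_size, Memory dst_memory_type,
    bool* cuda_used)
{
  *cuda_used = false;
  if (dst_memory_type == Memory::CPU) {
    std::memcpy(dst, src, byte_size);
  } else {
    // In the real backend this would be an asynchronous device copy on the
    // instance's CUDA stream; here we only record that a sync would be needed.
    *cuda_used = true;
  }
}

int main()
{
  std::vector<uint8_t> src(16, 42), dst(16, 0);
  bool cuda_copy = false;

  bool cuda_used;
  CopyToOutputBuffer(src.data(), dst.data(), src.size(), Memory::CPU, &cuda_used);
  cuda_copy |= cuda_used;

  if (cuda_copy) {
    // cudaStreamSynchronize(stream_) would go here, as in the diff.
  }
  std::printf("first byte after copy: %u\n", dst[0]);
  return 0;
}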
