Skip to content

Commit b62f9f7

Browse files
authored
Improve lifetime of NumPy objects (triton-inference-server#149)
* Improve lifetime of NumPy objects * Fix the leak * Add error for using decoupled API in BLS
1 parent 84a6405 commit b62f9f7

File tree

6 files changed

+55
-16
lines changed

6 files changed

+55
-16
lines changed

src/pb_stub.cc

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -858,7 +858,13 @@ PYBIND11_EMBEDDED_MODULE(c_python_backend_utils, module)
858858
py::class_<PbTensor, std::shared_ptr<PbTensor>>(module, "Tensor")
859859
.def(py::init(&PbTensor::FromNumpy))
860860
.def("name", &PbTensor::Name)
861-
.def("as_numpy", &PbTensor::AsNumpy)
861+
// The reference_internal is added to make sure that the NumPy object has
862+
// the same lifetime as the tensor object. This means that even when only
863+
// the NumPy object is in scope, the tensor object is not deallocated from
864+
// shared memory to make sure the NumPy object is still valid.
865+
.def(
866+
"as_numpy", &PbTensor::AsNumpy,
867+
py::return_value_policy::reference_internal)
862868
.def("triton_dtype", &PbTensor::TritonDtype)
863869
.def("to_dlpack", &PbTensor::ToDLPack)
864870
.def("is_cpu", &PbTensor::IsCPU)

src/pb_tensor.cc

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -395,17 +395,15 @@ PbTensor::Name() const
395395
}
396396

397397
#ifdef TRITON_PB_STUB
398-
const py::array&
398+
const py::array*
399399
PbTensor::AsNumpy() const
400400
{
401401
if (IsCPU()) {
402-
return numpy_array_;
402+
return &numpy_array_;
403403
} else {
404404
throw PythonBackendException(
405405
"Tensor is stored in GPU and cannot be converted to NumPy.");
406406
}
407-
408-
return numpy_array_;
409407
}
410408
#endif // TRITON_PB_STUB
411409

src/pb_tensor.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -159,7 +159,7 @@ class PbTensor {
159159
/// Get NumPy representation of the tensor.
160160
/// \throw If the tensor is stored in GPU, an exception is thrown
161161
/// \return NumPy representation of the Tensor
162-
const py::array& AsNumpy() const;
162+
const py::array* AsNumpy() const;
163163
#endif
164164

165165
/// Save tensor inside shared memory.

src/python.cc

Lines changed: 22 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1097,6 +1097,13 @@ ModelInstanceState::GetInputTensor(
10971097

10981098
ModelState* model_state = reinterpret_cast<ModelState*>(Model());
10991099
bool cpu_only_tensors = model_state->ForceCPUOnlyInputTensors();
1100+
1101+
if (!cpu_only_tensors && model_state->IsDecoupled()) {
1102+
return TRITONSERVER_ErrorNew(
1103+
TRITONSERVER_ERROR_INTERNAL,
1104+
"FORCE_CPU_ONLY_INPUT_TENSORS set to OFF is not yet supported in the "
1105+
"decoupled API.");
1106+
}
11001107
if (input_dtype == TRITONSERVER_TYPE_BYTES) {
11011108
cpu_only_tensors = true;
11021109
}
@@ -2443,19 +2450,13 @@ TRITONBACKEND_ModelInstanceExecute(
24432450
requests, request_count, infer_requests);
24442451

24452452
if (error != nullptr) {
2446-
for (auto& infer_request : infer_requests) {
2447-
// We should only delete the response factory for the requests that have
2448-
// not been closed.
2453+
for (uint32_t r = 0; r < request_count; ++r) {
2454+
TRITONBACKEND_Request* request = requests[r];
24492455
if (!instance_state->ExistsInClosedRequests(
2450-
infer_request->RequestAddress())) {
2451-
LOG_IF_ERROR(
2452-
infer_request->DeleteResponseFactory(),
2453-
"Failed to delete the response factory.");
2456+
reinterpret_cast<intptr_t>(request))) {
24542457
TRITONBACKEND_Response* response = nullptr;
24552458
LOG_IF_ERROR(
2456-
TRITONBACKEND_ResponseNew(
2457-
&response, reinterpret_cast<TRITONBACKEND_Request*>(
2458-
infer_request->RequestAddress())),
2459+
TRITONBACKEND_ResponseNew(&response, request),
24592460
"Failed to create a new response.");
24602461

24612462
if (response != nullptr) {
@@ -2466,6 +2467,17 @@ TRITONBACKEND_ModelInstanceExecute(
24662467
}
24672468
}
24682469
}
2470+
2471+
// We should only delete the response factory for the requests that have
2472+
// not been closed.
2473+
for (auto& infer_request : infer_requests) {
2474+
if (!instance_state->ExistsInClosedRequests(
2475+
infer_request->RequestAddress())) {
2476+
LOG_IF_ERROR(
2477+
infer_request->DeleteResponseFactory(),
2478+
"Failed to delete the response factory.");
2479+
}
2480+
}
24692481
}
24702482
}
24712483

src/request_executor.cc

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -186,6 +186,18 @@ RequestExecutor::Infer(
186186
.c_str());
187187
}
188188

189+
uint32_t txn_flags;
190+
THROW_IF_TRITON_ERROR(TRITONSERVER_ServerModelTransactionProperties(
191+
server_, model_name, model_version, &txn_flags, nullptr /* voidp */));
192+
193+
// Decoupled API is not supported in the current BLS interface
194+
if ((txn_flags & TRITONSERVER_TXN_DECOUPLED) != 0) {
195+
throw PythonBackendException(
196+
std::string("Model ") + model_name +
197+
" is using the decoupled API. BLS doesn't support models using the "
198+
"decoupled transaction policy.");
199+
}
200+
189201
// Inference
190202
THROW_IF_TRITON_ERROR(TRITONSERVER_InferenceRequestNew(
191203
&irequest, server_, model_name, model_version));

src/response_sender.cc

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -67,6 +67,17 @@ ResponseSender::Send(
6767
"set to zero.");
6868
}
6969

70+
if (infer_response) {
71+
for (auto& tensor : infer_response->OutputTensors()) {
72+
if (!tensor->IsCPU()) {
73+
throw PythonBackendException(
74+
"Tensor '" + tensor->Name() +
75+
"' is stored in GPU. GPU tensors are not supported yet in the "
76+
"decoupled response sender.");
77+
}
78+
}
79+
}
80+
7081
std::unique_ptr<Stub>& stub = Stub::GetOrCreateInstance();
7182

7283
AllocatedSharedMemory<ResponseSendMessage> response_send_message =

0 commit comments

Comments
 (0)