Skip to content

Commit b62f9f7

Browse files
authored
Improve lifetime of NumPy objects (triton-inference-server#149)
* Improve lifetime of NumPy objects * Fix the leak * Add error for using decoupled API in BLS
1 parent 84a6405 commit b62f9f7

File tree

6 files changed

+55
-16
lines changed

6 files changed

+55
-16
lines changed

src/pb_stub.cc

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -858,7 +858,13 @@ PYBIND11_EMBEDDED_MODULE(c_python_backend_utils, module)
858858
py::class_<PbTensor, std::shared_ptr<PbTensor>>(module, "Tensor")
859859
.def(py::init(&PbTensor::FromNumpy))
860860
.def("name", &PbTensor::Name)
861-
.def("as_numpy", &PbTensor::AsNumpy)
861+
// The reference_internal is added to make sure that the NumPy object has
862+
// the same lifetime as the tensor object. This means that even when only
863+
// the NumPy object is in scope, the tensor object is not deallocated from
864+
// shared memory to make sure the NumPy object is still valid.
865+
.def(
866+
"as_numpy", &PbTensor::AsNumpy,
867+
py::return_value_policy::reference_internal)
862868
.def("triton_dtype", &PbTensor::TritonDtype)
863869
.def("to_dlpack", &PbTensor::ToDLPack)
864870
.def("is_cpu", &PbTensor::IsCPU)

src/pb_tensor.cc

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -395,17 +395,15 @@ PbTensor::Name() const
395395
}
396396

397397
#ifdef TRITON_PB_STUB
398-
const py::array&
398+
const py::array*
399399
PbTensor::AsNumpy() const
400400
{
401401
if (IsCPU()) {
402-
return numpy_array_;
402+
return &numpy_array_;
403403
} else {
404404
throw PythonBackendException(
405405
"Tensor is stored in GPU and cannot be converted to NumPy.");
406406
}
407-
408-
return numpy_array_;
409407
}
410408
#endif // TRITON_PB_STUB
411409

src/pb_tensor.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -159,7 +159,7 @@ class PbTensor {
159159
/// Get NumPy representation of the tensor.
160160
/// \throw If the tensor is stored in GPU, an exception is thrown
161161
/// \return NumPy representation of the Tensor
162-
const py::array& AsNumpy() const;
162+
const py::array* AsNumpy() const;
163163
#endif
164164

165165
/// Save tensor inside shared memory.

src/python.cc

Lines changed: 22 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1097,6 +1097,13 @@ ModelInstanceState::GetInputTensor(
10971097

10981098
ModelState* model_state = reinterpret_cast<ModelState*>(Model());
10991099
bool cpu_only_tensors = model_state->ForceCPUOnlyInputTensors();
1100+
1101+
if (!cpu_only_tensors && model_state->IsDecoupled()) {
1102+
return TRITONSERVER_ErrorNew(
1103+
TRITONSERVER_ERROR_INTERNAL,
1104+
"FORCE_CPU_ONLY_INPUT_TENSORS set to OFF is not yet supported in the "
1105+
"decoupled API.");
1106+
}
11001107
if (input_dtype == TRITONSERVER_TYPE_BYTES) {
11011108
cpu_only_tensors = true;
11021109
}
@@ -2443,19 +2450,13 @@ TRITONBACKEND_ModelInstanceExecute(
24432450
requests, request_count, infer_requests);
24442451

24452452
if (error != nullptr) {
2446-
for (auto& infer_request : infer_requests) {
2447-
// We should only delete the response factory for the requests that have
2448-
// not been closed.
2453+
for (uint32_t r = 0; r < request_count; ++r) {
2454+
TRITONBACKEND_Request* request = requests[r];
24492455
if (!instance_state->ExistsInClosedRequests(
2450-
infer_request->RequestAddress())) {
2451-
LOG_IF_ERROR(
2452-
infer_request->DeleteResponseFactory(),
2453-
"Failed to delete the response factory.");
2456+
reinterpret_cast<intptr_t>(request))) {
24542457
TRITONBACKEND_Response* response = nullptr;
24552458
LOG_IF_ERROR(
2456-
TRITONBACKEND_ResponseNew(
2457-
&response, reinterpret_cast<TRITONBACKEND_Request*>(
2458-
infer_request->RequestAddress())),
2459+
TRITONBACKEND_ResponseNew(&response, request),
24592460
"Failed to create a new response.");
24602461

24612462
if (response != nullptr) {
@@ -2466,6 +2467,17 @@ TRITONBACKEND_ModelInstanceExecute(
24662467
}
24672468
}
24682469
}
2470+
2471+
// We should only delete the response factory for the requests that have
2472+
// not been closed.
2473+
for (auto& infer_request : infer_requests) {
2474+
if (!instance_state->ExistsInClosedRequests(
2475+
infer_request->RequestAddress())) {
2476+
LOG_IF_ERROR(
2477+
infer_request->DeleteResponseFactory(),
2478+
"Failed to delete the response factory.");
2479+
}
2480+
}
24692481
}
24702482
}
24712483

src/request_executor.cc

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -186,6 +186,18 @@ RequestExecutor::Infer(
186186
.c_str());
187187
}
188188

189+
uint32_t txn_flags;
190+
THROW_IF_TRITON_ERROR(TRITONSERVER_ServerModelTransactionProperties(
191+
server_, model_name, model_version, &txn_flags, nullptr /* voidp */));
192+
193+
// Decoupled API is not supported in the current BLS interface
194+
if ((txn_flags & TRITONSERVER_TXN_DECOUPLED) != 0) {
195+
throw PythonBackendException(
196+
std::string("Model ") + model_name +
197+
" is using the decoupled API. BLS doesn't support models using the "
198+
"decoupled transaction policy.");
199+
}
200+
189201
// Inference
190202
THROW_IF_TRITON_ERROR(TRITONSERVER_InferenceRequestNew(
191203
&irequest, server_, model_name, model_version));

src/response_sender.cc

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -67,6 +67,17 @@ ResponseSender::Send(
6767
"set to zero.");
6868
}
6969

70+
if (infer_response) {
71+
for (auto& tensor : infer_response->OutputTensors()) {
72+
if (!tensor->IsCPU()) {
73+
throw PythonBackendException(
74+
"Tensor '" + tensor->Name() +
75+
"' is stored in GPU. GPU tensors are not supported yet in the "
76+
"decoupled response sender.");
77+
}
78+
}
79+
}
80+
7081
std::unique_ptr<Stub>& stub = Stub::GetOrCreateInstance();
7182

7283
AllocatedSharedMemory<ResponseSendMessage> response_send_message =

0 commit comments

Comments
 (0)