From ffbac67072c903210440b552333a2b8346de17db Mon Sep 17 00:00:00 2001 From: Francesco Petrini Date: Mon, 4 Dec 2023 15:29:44 -0800 Subject: [PATCH 01/54] BLS Timeout Fix (#315) * Pass request timeout and increase size of timeout variable --- src/infer_request.cc | 4 ++-- src/infer_request.h | 8 ++++---- src/pb_stub.cc | 2 +- src/python_be.cc | 8 ++++++-- 4 files changed, 13 insertions(+), 9 deletions(-) diff --git a/src/infer_request.cc b/src/infer_request.cc index d641526e..da2a6b6c 100644 --- a/src/infer_request.cc +++ b/src/infer_request.cc @@ -42,7 +42,7 @@ InferRequest::InferRequest( const std::vector>& inputs, const std::set& requested_output_names, const std::string& model_name, const int64_t model_version, - const std::string& parameters, const uint32_t flags, const int32_t timeout, + const std::string& parameters, const uint32_t flags, const uint64_t timeout, const intptr_t response_factory_address, const intptr_t request_address, const PreferredMemory& preferred_memory, const InferenceTrace& trace) : request_id_(request_id), correlation_id_(correlation_id), inputs_(inputs), @@ -145,7 +145,7 @@ InferRequest::ShmHandle() return shm_handle_; } -int32_t +uint64_t InferRequest::Timeout() { return timeout_; diff --git a/src/infer_request.h b/src/infer_request.h index 3d81c5d2..38850c61 100644 --- a/src/infer_request.h +++ b/src/infer_request.h @@ -70,7 +70,7 @@ struct InferRequestShm { intptr_t address; intptr_t response_factory_address; bool is_decoupled; - int32_t timeout; + uint64_t timeout; PreferredMemory preferred_memory; InferenceTrace trace; uint32_t request_release_flags; @@ -84,7 +84,7 @@ class InferRequest { const std::set& requested_output_names, const std::string& model_name, const int64_t model_version, const std::string& parameters, const uint32_t flags = 0, - const int32_t timeout = 0, const intptr_t response_factory_address = 0, + const uint64_t timeout = 0, const intptr_t response_factory_address = 0, const intptr_t request_address = 0, const PreferredMemory& preferred_memory = PreferredMemory(PreferredMemory::DEFAULT, 0), @@ -100,7 +100,7 @@ class InferRequest { void SetFlags(uint32_t flags); const std::set& RequestedOutputNames(); bi::managed_external_buffer::handle_t ShmHandle(); - int32_t Timeout(); + uint64_t Timeout(); bool IsDecoupled(); void SetIsDecoupled(const bool is_decoupled); PreferredMemory& GetPreferredMemory(); @@ -158,7 +158,7 @@ class InferRequest { int64_t model_version_; std::string parameters_; uint32_t flags_; - int32_t timeout_; + uint64_t timeout_; intptr_t response_factory_address_; intptr_t request_address_; bool is_decoupled_; diff --git a/src/pb_stub.cc b/src/pb_stub.cc index 3d473101..4c5e9ae7 100644 --- a/src/pb_stub.cc +++ b/src/pb_stub.cc @@ -1581,7 +1581,7 @@ PYBIND11_EMBEDDED_MODULE(c_python_backend_utils, module) const std::vector& requested_output_names, const std::string& model_name, const int64_t model_version, const uint32_t flags, - const int32_t timeout, + const uint64_t timeout, const PreferredMemory& preferred_memory, const InferenceTrace& trace, const py::object& parameters_) { diff --git a/src/python_be.cc b/src/python_be.cc index cec2d18a..ccdae3e4 100644 --- a/src/python_be.cc +++ b/src/python_be.cc @@ -372,6 +372,10 @@ ModelInstanceState::SaveRequestsToSharedMemory( } InferenceTrace trace = InferenceTrace(triton_trace); + uint64_t request_timeout; + RETURN_IF_ERROR(TRITONBACKEND_InferenceRequestTimeoutMicroseconds( + request, &request_timeout)); + std::unique_ptr infer_request; if (model_state->IsDecoupled()) { 
TRITONBACKEND_ResponseFactory* factory_ptr; @@ -394,14 +398,14 @@ ModelInstanceState::SaveRequestsToSharedMemory( infer_request = std::make_unique( id, correlation_id, pb_input_tensors, requested_output_names, model_state->Name(), model_state->Version(), parameters_string, flags, - 0 /* BLS request timeout*/, reinterpret_cast(factory_ptr), + request_timeout, reinterpret_cast(factory_ptr), reinterpret_cast(request), PreferredMemory(PreferredMemory::DEFAULT, 0), trace); } else { infer_request = std::make_unique( id, correlation_id, pb_input_tensors, requested_output_names, model_state->Name(), model_state->Version(), parameters_string, flags, - 0 /* BLS request timeout*/, 0 /* response_factory_address */, + request_timeout, 0 /* response_factory_address */, reinterpret_cast(request), PreferredMemory(PreferredMemory::DEFAULT, 0), trace); } From 8b0fa4cc5daa4b1891cdc5b0b42079dbe2a60eae Mon Sep 17 00:00:00 2001 From: Kris Hung Date: Fri, 8 Dec 2023 14:00:03 -0800 Subject: [PATCH 02/54] Fix BLS decoupled segfault and hang (#325) * Store InferPayload using the address of the object managed by the shared_ptr * Fix hang * Release GIL before sending message to the other process * Release GIL in the beginning --- src/infer_request.cc | 8 +++++++- src/python_be.cc | 4 ++-- src/python_be.h | 2 +- src/response_sender.cc | 7 +++++++ 4 files changed, 17 insertions(+), 4 deletions(-) diff --git a/src/infer_request.cc b/src/infer_request.cc index da2a6b6c..c21feeaa 100644 --- a/src/infer_request.cc +++ b/src/infer_request.cc @@ -442,6 +442,13 @@ InferRequest::GetResponseSender() std::shared_ptr InferRequest::Exec(const bool is_decoupled) { + // Release the GIL. This avoids a potential deadlock situation in the parent + // process, where every thread in the thread pool is indirectly waiting for a + // function in the stub process that acquires the GIL. Meanwhile, the current + // thread, which holds the GIL, is also waiting for the parent side to have + // the next available thread to pick up the job during resource contention. + py::gil_scoped_release release; + // BLS should not be used in "initialize" or "finalize" function. std::unique_ptr& stub = Stub::GetOrCreateInstance(); if (!stub->IsInitialized() || stub->IsFinalizing()) { @@ -465,7 +472,6 @@ InferRequest::Exec(const bool is_decoupled) }); try { - py::gil_scoped_release release; ipc_message = IPCMessage::Create(shm_pool, true /* inline_response */); bool has_exception = false; PythonBackendException pb_exception(std::string{}); diff --git a/src/python_be.cc b/src/python_be.cc index ccdae3e4..6de5bcf3 100644 --- a/src/python_be.cc +++ b/src/python_be.cc @@ -752,7 +752,7 @@ ModelInstanceState::ExecuteBLSRequest( if (is_decoupled && (infer_response->Id() != nullptr)) { // Need to manage the lifetime of InferPayload object for bls // decoupled responses. 
- infer_payload_[reinterpret_cast(&infer_payload)] = + infer_payload_[reinterpret_cast(infer_payload.get())] = infer_payload; } @@ -943,7 +943,7 @@ ModelInstanceState::ProcessBLSCleanupRequest( reinterpret_cast(cleanup_request_message.data_.get()); void* id = cleanup_message_ptr->id; - infer_payload_.erase(id); + infer_payload_.erase(reinterpret_cast(id)); { bi::scoped_lock lock{*(message->ResponseMutex())}; diff --git a/src/python_be.h b/src/python_be.h index 5504e0c9..2fc755ca 100644 --- a/src/python_be.h +++ b/src/python_be.h @@ -286,7 +286,7 @@ class ModelInstanceState : public BackendModelInstance { std::unique_ptr received_message_; std::vector> futures_; std::unique_ptr thread_pool_; - std::unordered_map> infer_payload_; + std::unordered_map> infer_payload_; std::unique_ptr request_executor_; std::mutex response_factory_map_mutex_; std::unordered_map diff --git a/src/response_sender.cc b/src/response_sender.cc index 1e2e9b50..c6b8f788 100644 --- a/src/response_sender.cc +++ b/src/response_sender.cc @@ -50,6 +50,13 @@ void ResponseSender::Send( std::shared_ptr infer_response, const uint32_t flags) { + // Release the GIL. This avoids a potential deadlock situation in the parent + // process, where every thread in the thread pool is indirectly waiting for a + // function in the stub process that acquires the GIL. Meanwhile, the current + // thread, which holds the GIL, is also waiting for the parent side to have + // the next available thread to pick up the job during resource contention. + py::gil_scoped_release release; + if (closed_) { throw PythonBackendException( "Unable to send response. Response sender has been closed."); From c5f304decda609ab21a004c525436e58dd527190 Mon Sep 17 00:00:00 2001 From: Kris Hung Date: Thu, 14 Dec 2023 16:03:59 -0800 Subject: [PATCH 03/54] Fix segfault for decoupled models (#327) * Set release flags and clean up response factory map before returning error * Address comments * Move the cleanup function to the outside scope * Delete response factory when response sender goes out of scope --- src/infer_request.cc | 14 -------- src/infer_request.h | 4 --- src/ipc_message.h | 3 +- src/pb_response_iterator.cc | 2 +- src/pb_stub.cc | 18 ++++++---- src/pb_stub.h | 9 +++-- src/python_be.cc | 68 +++++++++---------------------------- src/python_be.h | 6 ++-- src/response_sender.cc | 7 ++++ src/response_sender.h | 1 + 10 files changed, 47 insertions(+), 85 deletions(-) diff --git a/src/infer_request.cc b/src/infer_request.cc index c21feeaa..f18900d0 100644 --- a/src/infer_request.cc +++ b/src/infer_request.cc @@ -405,20 +405,6 @@ InferRequest::InferRequest( #endif } -#ifndef TRITON_PB_STUB -TRITONSERVER_Error* -InferRequest::DeleteResponseFactory() -{ - TRITONBACKEND_ResponseFactory* response_factory = - reinterpret_cast( - response_factory_address_); - TRITONSERVER_Error* error = - TRITONBACKEND_ResponseFactoryDelete(response_factory); - - return error; -} -#endif - #ifdef TRITON_PB_STUB bool InferRequest::IsCancelled() diff --git a/src/infer_request.h b/src/infer_request.h index 38850c61..b8dee87c 100644 --- a/src/infer_request.h +++ b/src/infer_request.h @@ -137,10 +137,6 @@ class InferRequest { intptr_t RequestAddress(); ~InferRequest() {} -#ifndef TRITON_PB_STUB - TRITONSERVER_Error* DeleteResponseFactory(); -#endif - private: InferRequest( AllocatedSharedMemory& infer_request_shm, diff --git a/src/ipc_message.h b/src/ipc_message.h index d720a84d..866070f6 100644 --- a/src/ipc_message.h +++ b/src/ipc_message.h @@ -54,7 +54,8 @@ typedef enum 
PYTHONSTUB_commandtype_enum { PYTHONSTUB_AutoCompleteRequest, PYTHONSTUB_AutoCompleteResponse, PYTHONSTUB_LogRequest, - PYTHONSTUB_CleanupRequest, + PYTHONSTUB_BLSDecoupledInferPayloadCleanup, + PYTHONSTUB_BLSDecoupledResponseFactoryCleanup, PYTHONSTUB_MetricFamilyRequestNew, PYTHONSTUB_MetricFamilyRequestDelete, PYTHONSTUB_MetricRequestNew, diff --git a/src/pb_response_iterator.cc b/src/pb_response_iterator.cc index 1e0d631a..9abf4997 100644 --- a/src/pb_response_iterator.cc +++ b/src/pb_response_iterator.cc @@ -133,7 +133,7 @@ void ResponseIterator::Clear() { std::unique_ptr& stub = Stub::GetOrCreateInstance(); - stub->EnqueueCleanupId(id_); + stub->EnqueueCleanupId(id_, PYTHONSTUB_BLSDecoupledInferPayloadCleanup); { std::lock_guard lock{mu_}; response_buffer_.push(DUMMY_MESSAGE); diff --git a/src/pb_stub.cc b/src/pb_stub.cc index 4c5e9ae7..53a6c540 100644 --- a/src/pb_stub.cc +++ b/src/pb_stub.cc @@ -993,8 +993,12 @@ Stub::ServiceStubToParentRequests() stub_to_parent_buffer_.pop(); if (utils_msg_payload->command_type == PYTHONSTUB_LogRequest) { SendLogMessage(utils_msg_payload); - } else if (utils_msg_payload->command_type == PYTHONSTUB_CleanupRequest) { - SendCleanupId(utils_msg_payload); + } else if ( + (utils_msg_payload->command_type == + PYTHONSTUB_BLSDecoupledInferPayloadCleanup) || + (utils_msg_payload->command_type == + PYTHONSTUB_BLSDecoupledResponseFactoryCleanup)) { + SendCleanupId(utils_msg_payload, utils_msg_payload->command_type); } else if ( utils_msg_payload->command_type == PYTHONSTUB_IsRequestCancelled) { SendIsCancelled(utils_msg_payload); @@ -1040,7 +1044,9 @@ Stub::SendLogMessage(std::unique_ptr& utils_msg_payload) } void -Stub::SendCleanupId(std::unique_ptr& utils_msg_payload) +Stub::SendCleanupId( + std::unique_ptr& utils_msg_payload, + const PYTHONSTUB_CommandType& command_type) { void* id = utils_msg_payload->utils_message_ptr; { @@ -1050,7 +1056,7 @@ Stub::SendCleanupId(std::unique_ptr& utils_msg_payload) std::unique_ptr ipc_message = IPCMessage::Create(shm_pool_, true /* inline_response */); - ipc_message->Command() = PYTHONSTUB_CleanupRequest; + ipc_message->Command() = command_type; AllocatedSharedMemory cleanup_request_message = shm_pool_->Construct( sizeof(CleanupMessage) + @@ -1072,11 +1078,11 @@ Stub::SendCleanupId(std::unique_ptr& utils_msg_payload) } void -Stub::EnqueueCleanupId(void* id) +Stub::EnqueueCleanupId(void* id, const PYTHONSTUB_CommandType& command_type) { if (id != nullptr) { std::unique_ptr utils_msg_payload = - std::make_unique(PYTHONSTUB_CleanupRequest, id); + std::make_unique(command_type, id); EnqueueUtilsMessage(std::move(utils_msg_payload)); } } diff --git a/src/pb_stub.h b/src/pb_stub.h index 12b47abc..74a66b95 100644 --- a/src/pb_stub.h +++ b/src/pb_stub.h @@ -315,10 +315,13 @@ class Stub { std::shared_ptr infer_response); /// Send the id to the python backend for object cleanup - void SendCleanupId(std::unique_ptr& utils_msg_payload); + void SendCleanupId( + std::unique_ptr& utils_msg_payload, + const PYTHONSTUB_CommandType& command_type); - /// Add cleanup id to queue - void EnqueueCleanupId(void* id); + /// Add cleanup id to queue. This is used for cleaning up the infer_payload + /// and the response factory for BLS decoupled response. 
+ void EnqueueCleanupId(void* id, const PYTHONSTUB_CommandType& command_type); /// Add request cancellation query to queue void EnqueueIsCancelled(PbCancel* pb_cancel); diff --git a/src/python_be.cc b/src/python_be.cc index 6de5bcf3..8dfa72b1 100644 --- a/src/python_be.cc +++ b/src/python_be.cc @@ -379,21 +379,7 @@ ModelInstanceState::SaveRequestsToSharedMemory( std::unique_ptr infer_request; if (model_state->IsDecoupled()) { TRITONBACKEND_ResponseFactory* factory_ptr; - // Reuse the response factory if there is already a response factory - // associated with the request - std::lock_guard guard{response_factory_map_mutex_}; - { - if (response_factory_map_.find(reinterpret_cast(request)) != - response_factory_map_.end()) { - factory_ptr = - response_factory_map_[reinterpret_cast(request)]; - } else { - RETURN_IF_ERROR( - TRITONBACKEND_ResponseFactoryNew(&factory_ptr, request)); - response_factory_map_[reinterpret_cast(request)] = - factory_ptr; - } - } + RETURN_IF_ERROR(TRITONBACKEND_ResponseFactoryNew(&factory_ptr, request)); infer_request = std::make_unique( id, correlation_id, pb_input_tensors, requested_output_names, @@ -843,7 +829,8 @@ ModelInstanceState::StubToParentMQMonitor() ProcessLogRequest(message); break; } - case PYTHONSTUB_CleanupRequest: { + case PYTHONSTUB_BLSDecoupledInferPayloadCleanup: + case PYTHONSTUB_BLSDecoupledResponseFactoryCleanup: { ProcessBLSCleanupRequest(message); break; } @@ -941,9 +928,17 @@ ModelInstanceState::ProcessBLSCleanupRequest( Stub()->ShmPool()->Load(message->Args()); CleanupMessage* cleanup_message_ptr = reinterpret_cast(cleanup_request_message.data_.get()); - - void* id = cleanup_message_ptr->id; - infer_payload_.erase(reinterpret_cast(id)); + intptr_t id = reinterpret_cast(cleanup_message_ptr->id); + if (message->Command() == PYTHONSTUB_BLSDecoupledInferPayloadCleanup) { + // Remove the InferPayload object from the map. + infer_payload_.erase(id); + } else if ( + message->Command() == PYTHONSTUB_BLSDecoupledResponseFactoryCleanup) { + // Delete response factory + std::unique_ptr< + TRITONBACKEND_ResponseFactory, backend::ResponseFactoryDeleter> + response_factory(reinterpret_cast(id)); + } { bi::scoped_lock lock{*(message->ResponseMutex())}; @@ -1172,12 +1167,6 @@ ModelInstanceState::ResponseSendDecoupled( std::lock_guard guard{closed_requests_mutex_}; closed_requests_.push_back(send_message_payload->request_address); } - - // Clean up the response factory map. 
- { - std::lock_guard guard{response_factory_map_mutex_}; - response_factory_map_.erase(send_message_payload->request_address); - } } if (send_message_payload->response != 0) { @@ -1195,14 +1184,7 @@ ModelInstanceState::ResponseSendDecoupled( error_message); std::vector, void*>> gpu_output_buffers; - std::unique_ptr< - TRITONBACKEND_ResponseFactory, backend::ResponseFactoryDeleter> - response_factory_ptr; GPUBuffersHelper gpu_buffer_helper; - if (send_message_payload->flags == TRITONSERVER_RESPONSE_COMPLETE_FINAL) { - response_factory_ptr.reset( - reinterpret_cast(response_factory)); - } #ifdef TRITON_ENABLE_GPU for (auto& output_tensor : infer_response->OutputTensors()) { @@ -1289,13 +1271,6 @@ ModelInstanceState::ResponseSendDecoupled( response_factory, send_message_payload->flags); SetErrorForResponseSendMessage( send_message_payload, WrapTritonErrorInSharedPtr(error), error_message); - - if (send_message_payload->flags == TRITONSERVER_RESPONSE_COMPLETE_FINAL) { - std::unique_ptr< - TRITONBACKEND_ResponseFactory, backend::ResponseFactoryDeleter> - response_factory(reinterpret_cast( - send_message_payload->response_factory_address)); - } } } @@ -1368,11 +1343,6 @@ ModelInstanceState::ProcessRequestsDecoupled( TRITONSERVER_ERROR_INTERNAL, error->String().c_str()); } - // Reset the release flags for all the requests. - for (auto& infer_request : pb_infer_requests) { - infer_request->SetReleaseFlags(TRITONSERVER_REQUEST_RELEASE_ALL); - } - return TRITONSERVER_ErrorNew( TRITONSERVER_ERROR_INTERNAL, "Failed to process the requests."); } @@ -2499,15 +2469,9 @@ TRITONBACKEND_ModelInstanceExecute( } } - // We should only delete the response factory for the requests that have - // not been closed. for (auto& infer_request : infer_requests) { - if (!instance_state->ExistsInClosedRequests( - infer_request->RequestAddress())) { - LOG_IF_ERROR( - infer_request->DeleteResponseFactory(), - "Failed to delete the response factory."); - } + // Reset the release flags for all the requests. 
+ infer_request->SetReleaseFlags(TRITONSERVER_REQUEST_RELEASE_ALL); } } } diff --git a/src/python_be.h b/src/python_be.h index 2fc755ca..e644e159 100644 --- a/src/python_be.h +++ b/src/python_be.h @@ -288,9 +288,6 @@ class ModelInstanceState : public BackendModelInstance { std::unique_ptr thread_pool_; std::unordered_map> infer_payload_; std::unique_ptr request_executor_; - std::mutex response_factory_map_mutex_; - std::unordered_map - response_factory_map_; public: static TRITONSERVER_Error* Create( @@ -403,7 +400,8 @@ class ModelInstanceState : public BackendModelInstance { std::unique_ptr* infer_response, bi::managed_external_buffer::handle_t* response_handle); - // Process the bls decoupled cleanup request + // Process the bls decoupled cleanup request for InferPayload and + // ResponseFactory void ProcessBLSCleanupRequest(const std::unique_ptr& message); // Process request cancellation query diff --git a/src/response_sender.cc b/src/response_sender.cc index c6b8f788..fe06e554 100644 --- a/src/response_sender.cc +++ b/src/response_sender.cc @@ -45,6 +45,13 @@ ResponseSender::ResponseSender( { } +ResponseSender::~ResponseSender() +{ + std::unique_ptr& stub = Stub::GetOrCreateInstance(); + stub->EnqueueCleanupId( + reinterpret_cast(response_factory_address_), + PYTHONSTUB_BLSDecoupledResponseFactoryCleanup); +} void ResponseSender::Send( diff --git a/src/response_sender.h b/src/response_sender.h index fda0d5d3..d29a6ab6 100644 --- a/src/response_sender.h +++ b/src/response_sender.h @@ -38,6 +38,7 @@ class ResponseSender { intptr_t request_address, intptr_t response_factory_address, std::unique_ptr& shm_pool, const std::shared_ptr& pb_cancel); + ~ResponseSender(); void Send(std::shared_ptr response, const uint32_t flags); bool IsCancelled(); From 7551f036fead433ab29edc21dd58e6ccc10b2daa Mon Sep 17 00:00:00 2001 From: Iman Tabrizian Date: Mon, 18 Dec 2023 10:22:51 -0500 Subject: [PATCH 04/54] Fix warning for GPU tensors (#330) --- src/infer_response.cc | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/src/infer_response.cc b/src/infer_response.cc index 09737b26..5a898a7e 100644 --- a/src/infer_response.cc +++ b/src/infer_response.cc @@ -211,6 +211,10 @@ InferResponse::Send( std::vector, void*>>& output_buffers, const std::set& requested_output_names) { +#ifdef TRITON_ENABLE_GPU + static bool log_warning = true; +#endif // TRITON_ENABLE_GPU + std::shared_ptr response_error = WrapTritonErrorInSharedPtr(nullptr); std::unique_ptr response_error_handling; @@ -249,11 +253,6 @@ InferResponse::Send( } bool cuda_copy = false; -#ifdef TRITON_ENABLE_GPU - // This variable is used to avoid printing the same message multiple times - // when the output tensor is failed to be allocated from the CUDA memory pool. - bool log_warning = true; -#endif // TRITON_ENABLE_GPU for (auto& output_tensor : OutputTensors()) { // FIXME: for decoupled models we will skip the requested output names. 
From 950c47f0f989ae757136ff7d6441d653d6009de1 Mon Sep 17 00:00:00 2001 From: Kris Hung Date: Mon, 18 Dec 2023 16:16:18 -0800 Subject: [PATCH 05/54] Update name of ipc message type (#329) --- src/ipc_message.h | 2 +- src/pb_stub.cc | 2 +- src/python_be.cc | 9 ++++----- src/python_be.h | 5 ++--- src/response_sender.cc | 2 +- 5 files changed, 9 insertions(+), 11 deletions(-) diff --git a/src/ipc_message.h b/src/ipc_message.h index 866070f6..ac28238c 100644 --- a/src/ipc_message.h +++ b/src/ipc_message.h @@ -55,7 +55,7 @@ typedef enum PYTHONSTUB_commandtype_enum { PYTHONSTUB_AutoCompleteResponse, PYTHONSTUB_LogRequest, PYTHONSTUB_BLSDecoupledInferPayloadCleanup, - PYTHONSTUB_BLSDecoupledResponseFactoryCleanup, + PYTHONSTUB_DecoupledResponseFactoryCleanup, PYTHONSTUB_MetricFamilyRequestNew, PYTHONSTUB_MetricFamilyRequestDelete, PYTHONSTUB_MetricRequestNew, diff --git a/src/pb_stub.cc b/src/pb_stub.cc index 53a6c540..d1f8f6fd 100644 --- a/src/pb_stub.cc +++ b/src/pb_stub.cc @@ -997,7 +997,7 @@ Stub::ServiceStubToParentRequests() (utils_msg_payload->command_type == PYTHONSTUB_BLSDecoupledInferPayloadCleanup) || (utils_msg_payload->command_type == - PYTHONSTUB_BLSDecoupledResponseFactoryCleanup)) { + PYTHONSTUB_DecoupledResponseFactoryCleanup)) { SendCleanupId(utils_msg_payload, utils_msg_payload->command_type); } else if ( utils_msg_payload->command_type == PYTHONSTUB_IsRequestCancelled) { diff --git a/src/python_be.cc b/src/python_be.cc index 8dfa72b1..3c9dd19d 100644 --- a/src/python_be.cc +++ b/src/python_be.cc @@ -830,8 +830,8 @@ ModelInstanceState::StubToParentMQMonitor() break; } case PYTHONSTUB_BLSDecoupledInferPayloadCleanup: - case PYTHONSTUB_BLSDecoupledResponseFactoryCleanup: { - ProcessBLSCleanupRequest(message); + case PYTHONSTUB_DecoupledResponseFactoryCleanup: { + ProcessCleanupRequest(message); break; } case PYTHONSTUB_IsRequestCancelled: { @@ -921,7 +921,7 @@ ModelInstanceState::ProcessLogRequest( } void -ModelInstanceState::ProcessBLSCleanupRequest( +ModelInstanceState::ProcessCleanupRequest( const std::unique_ptr& message) { AllocatedSharedMemory cleanup_request_message = @@ -932,8 +932,7 @@ ModelInstanceState::ProcessBLSCleanupRequest( if (message->Command() == PYTHONSTUB_BLSDecoupledInferPayloadCleanup) { // Remove the InferPayload object from the map. 
infer_payload_.erase(id); - } else if ( - message->Command() == PYTHONSTUB_BLSDecoupledResponseFactoryCleanup) { + } else if (message->Command() == PYTHONSTUB_DecoupledResponseFactoryCleanup) { // Delete response factory std::unique_ptr< TRITONBACKEND_ResponseFactory, backend::ResponseFactoryDeleter> diff --git a/src/python_be.h b/src/python_be.h index e644e159..f5620d07 100644 --- a/src/python_be.h +++ b/src/python_be.h @@ -400,9 +400,8 @@ class ModelInstanceState : public BackendModelInstance { std::unique_ptr* infer_response, bi::managed_external_buffer::handle_t* response_handle); - // Process the bls decoupled cleanup request for InferPayload and - // ResponseFactory - void ProcessBLSCleanupRequest(const std::unique_ptr& message); + // Process the decoupled cleanup request for InferPayload and ResponseFactory + void ProcessCleanupRequest(const std::unique_ptr& message); // Process request cancellation query void ProcessIsRequestCancelled(const std::unique_ptr& message); diff --git a/src/response_sender.cc b/src/response_sender.cc index fe06e554..94e3f0c8 100644 --- a/src/response_sender.cc +++ b/src/response_sender.cc @@ -50,7 +50,7 @@ ResponseSender::~ResponseSender() std::unique_ptr& stub = Stub::GetOrCreateInstance(); stub->EnqueueCleanupId( reinterpret_cast(response_factory_address_), - PYTHONSTUB_BLSDecoupledResponseFactoryCleanup); + PYTHONSTUB_DecoupledResponseFactoryCleanup); } void From 2bdb14c03011c618ddd5e8080d70052c34b19a9f Mon Sep 17 00:00:00 2001 From: Ryan McCormick Date: Mon, 8 Jan 2024 14:33:54 -0800 Subject: [PATCH 06/54] Move from jfrog artifactory to archives.boost.io to fix boost download (#334) --- CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 54341e01..6fae6a00 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -100,7 +100,7 @@ FetchContent_MakeAvailable(dlpack) # ExternalProject_Add( boostorg - URL https://boostorg.jfrog.io/artifactory/main/release/1.79.0/source/boost_1_79_0.tar.gz + URL https://archives.boost.io/release/1.79.0/source/boost_1_79_0.tar.gz URL_HASH SHA256=273f1be93238a068aba4f9735a4a2b003019af067b9c183ed227780b8f36062c PREFIX "boost-src" CONFIGURE_COMMAND ${CMAKE_COMMAND} -E copy_directory From 4ee0fce531eb6e0aa793d895101846115518ea5c Mon Sep 17 00:00:00 2001 From: Kris Hung Date: Thu, 11 Jan 2024 11:57:35 -0800 Subject: [PATCH 07/54] Clean up response iterator map properly (#335) --- src/pb_stub.cc | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/src/pb_stub.cc b/src/pb_stub.cc index d1f8f6fd..a7d39852 100644 --- a/src/pb_stub.cc +++ b/src/pb_stub.cc @@ -372,6 +372,14 @@ Stub::RunCommand() } break; case PYTHONSTUB_CommandType::PYTHONSTUB_FinalizeRequest: ipc_message->Command() = PYTHONSTUB_FinalizeResponse; + // Clean up response_iterator_map_ before sending sending message back to + // the parent process to make sure that the clean up message can be + // processed before the message queue is destroyed. 
+ { + std::lock_guard lock(response_iterator_map_mu_); + std::unordered_map>().swap( + response_iterator_map_); + } SendIPCMessage(ipc_message); return true; // Terminate the stub process case PYTHONSTUB_CommandType::PYTHONSTUB_LoadGPUBuffers: @@ -1049,7 +1057,7 @@ Stub::SendCleanupId( const PYTHONSTUB_CommandType& command_type) { void* id = utils_msg_payload->utils_message_ptr; - { + if (command_type == PYTHONSTUB_BLSDecoupledInferPayloadCleanup) { std::lock_guard lock(response_iterator_map_mu_); response_iterator_map_.erase(id); } From 980a5bb00c3b136e9464d7667718f462e083afb9 Mon Sep 17 00:00:00 2001 From: Olga Andreeva <124622579+oandreeva-nv@users.noreply.github.com> Date: Thu, 11 Jan 2024 12:02:19 -0800 Subject: [PATCH 08/54] Bumping min required cxx standard to 17 (#332) --- CMakeLists.txt | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 6fae6a00..2b47df1d 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -28,6 +28,9 @@ cmake_minimum_required(VERSION 3.17) project(tritonpythonbackend LANGUAGES C CXX) +# Use C++17 standard as Triton's minimum required. +set(TRITON_MIN_CXX_STANDARD 17 CACHE STRING "The minimum C++ standard which features are requested to build this target.") + # # Options # @@ -231,14 +234,14 @@ add_library( TritonPythonBackend::triton-python-backend ALIAS triton-python-backend ) -target_compile_features(triton-python-backend PRIVATE cxx_std_11) +target_compile_features(triton-python-backend PRIVATE cxx_std_${TRITON_MIN_CXX_STANDARD}) target_compile_options( triton-python-backend PRIVATE $<$,$,$>: -Wall -Wextra -Wno-unused-parameter -Wno-type-limits -Werror> ) -target_compile_features(triton-python-backend-stub PRIVATE cxx_std_11) +target_compile_features(triton-python-backend-stub PRIVATE cxx_std_${TRITON_MIN_CXX_STANDARD}) target_compile_options( triton-python-backend-stub PRIVATE $<$,$,$>: From 9d67dc39d2e42658c650525eccc836b2e991627b Mon Sep 17 00:00:00 2001 From: Olga Andreeva <124622579+oandreeva-nv@users.noreply.github.com> Date: Thu, 18 Jan 2024 11:21:50 -0800 Subject: [PATCH 09/54] Changing cuda cxx flag (#338) --- CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 2b47df1d..2be987cd 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -119,7 +119,7 @@ set(boostorg_INCLUDE_DIRS "${CMAKE_BINARY_DIR}/boost/") if(${TRITON_ENABLE_GPU}) find_package(CUDAToolkit REQUIRED) message(STATUS "Using CUDA ${CUDA_VERSION}") - set(CUDA_NVCC_FLAGS -std=c++11) + set(CUDA_NVCC_FLAGS -std=c++${TRITON_MIN_CXX_STANDARD}) elseif() message(WARNING "TRITON_ENABLE_GPU is OFF, GPU Tensor support will be disabled") endif() # TRITON_ENABLE_GPU From 37d29025f8da7c81cf9b6d88f5ff4d44e389a732 Mon Sep 17 00:00:00 2001 From: Jacky <18255193+kthui@users.noreply.github.com> Date: Fri, 19 Jan 2024 15:33:58 -0800 Subject: [PATCH 10/54] Improve decoupled shm handling (#337) * [DO NOT MERGE] Add shm trace util * [DO NOT MERGE] Expand shm leak util naming to ipc load * Revert "[DO NOT MERGE] Expand shm leak util naming to ipc load" This reverts commit 68906f2dd32fa70fe247321391ce26967d04ec5a. * Revert "[DO NOT MERGE] Add shm trace util" This reverts commit 37824ce137b009e0ef13b46f440e1f94c865180e. 
* Fix decoupled shared memory leak --- src/python_be.cc | 1 + 1 file changed, 1 insertion(+) diff --git a/src/python_be.cc b/src/python_be.cc index 3c9dd19d..a8dfab07 100644 --- a/src/python_be.cc +++ b/src/python_be.cc @@ -1328,6 +1328,7 @@ ModelInstanceState::ProcessRequestsDecoupled( AllocatedSharedMemory response_batch = Stub()->ShmPool()->Load(received_message_->Args()); + received_message_.reset(); uint64_t compute_end_ns = 0; SET_TIMESTAMP(compute_end_ns); From 0371eb8f9ffd6e1f50ba5ceeee5da0d3cb1f6888 Mon Sep 17 00:00:00 2001 From: Francesco Petrini Date: Thu, 1 Feb 2024 10:06:23 -0800 Subject: [PATCH 11/54] Add double parameter handling (#333) * Support Double-Type Infer/Response Parameters --- src/python_be.cc | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/python_be.cc b/src/python_be.cc index a8dfab07..befdd593 100644 --- a/src/python_be.cc +++ b/src/python_be.cc @@ -340,6 +340,9 @@ ModelInstanceState::SaveRequestsToSharedMemory( } else if (type == TRITONSERVER_PARAMETER_STRING) { std::string string = reinterpret_cast(vvalue); RETURN_IF_ERROR(parameters_json.AddString(name, string)); + } else if (type == TRITONSERVER_PARAMETER_DOUBLE) { + RETURN_IF_ERROR(parameters_json.AddDouble( + name, *(reinterpret_cast(vvalue)))); } else { return TRITONSERVER_ErrorNew( TRITONSERVER_ERROR_INVALID_ARG, From ba616e26c256f11c41f7249c6a55220af8becee9 Mon Sep 17 00:00:00 2001 From: Francesco Petrini Date: Thu, 8 Feb 2024 11:28:10 -0800 Subject: [PATCH 12/54] Python Backend Windows Support (#294) * Base Python Backend Support for Windows --- CMakeLists.txt | 138 ++++++++++++------- src/infer_request.h | 2 +- src/metric_family.cc | 4 +- src/pb_env.cc | 48 ++++--- src/pb_env.h | 7 + src/pb_preferred_memory.h | 4 +- src/pb_stub.cc | 115 +++++++++++----- src/pb_stub.h | 32 ++--- src/pb_utils.cc | 120 +++++++++++------ src/pb_utils.h | 13 +- src/python_be.cc | 79 ++++++----- src/python_be.h | 23 +++- src/request_executor.cc | 6 +- src/shm_manager.h | 6 +- src/stub_launcher.cc | 272 ++++++++++++++++++++++++++++++++------ src/stub_launcher.h | 21 ++- 16 files changed, 629 insertions(+), 261 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 2be987cd..bc5387ef 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -41,6 +41,12 @@ option(TRITON_ENABLE_GPU "Enable GPU support in backend" ON) option(TRITON_ENABLE_STATS "Include statistics collections in backend" ON) option(TRITON_ENABLE_NVTX "Include nvtx markers collection in backend." OFF) +# FIXME: CI needs to enable the GPU flag. Python for window currently does not +# support GPU tensors. For simplicity, we will override this option here. 
+if(WIN32) + set(TRITON_ENABLE_GPU OFF CACHE BOOL "GPU disabled" FORCE) +endif() + set(TRITON_BACKEND_REPO_TAG "main" CACHE STRING "Tag for triton-inference-server/backend repo") set(TRITON_COMMON_REPO_TAG "main" CACHE STRING "Tag for triton-inference-server/common repo") set(TRITON_CORE_REPO_TAG "main" CACHE STRING "Tag for triton-inference-server/core repo") @@ -96,6 +102,9 @@ FetchContent_Declare( GIT_TAG "v0.8" GIT_SHALLOW ON ) +# Option must be set off so WIN32 build does not break +set(CMAKE_POLICY_DEFAULT_CMP0077 NEW) +set(BUILD_MOCK OFF) FetchContent_MakeAvailable(dlpack) # @@ -129,7 +138,10 @@ if(${TRITON_ENABLE_NVTX}) endif() # TRITON_ENABLE_NVTX find_package(ZLIB REQUIRED) -find_package(Threads REQUIRED) + +if(NOT WIN32) + find_package(Threads REQUIRED) +endif() include_directories(${CMAKE_BINARY_DIR}) configure_file(src/libtriton_python.ldscript libtriton_python.ldscript COPYONLY) @@ -174,21 +186,21 @@ set( ) set( - PYTHON_BACKEND_SRCS - src/python_be.cc - src/python_be.h - src/pb_env.cc - src/pb_env.h - src/pb_metric_reporter.cc - src/pb_metric_reporter.h - src/memory_manager.cc - src/memory_manager.h - src/request_executor.cc - src/request_executor.h - src/stub_launcher.h - src/stub_launcher.cc - src/infer_payload.h - src/infer_payload.cc + PYTHON_BACKEND_SRCS + src/python_be.cc + src/python_be.h + src/pb_env.cc + src/pb_env.h + src/pb_metric_reporter.cc + src/pb_metric_reporter.h + src/memory_manager.cc + src/memory_manager.h + src/request_executor.cc + src/request_executor.h + src/stub_launcher.h + src/stub_launcher.cc + src/infer_payload.h + src/infer_payload.cc ) list(APPEND @@ -239,48 +251,82 @@ target_compile_options( triton-python-backend PRIVATE $<$,$,$>: -Wall -Wextra -Wno-unused-parameter -Wno-type-limits -Werror> + $<$:/Wall /D_WIN32_WINNT=0x0A00 /EHsc /Zc:preprocessor> ) target_compile_features(triton-python-backend-stub PRIVATE cxx_std_${TRITON_MIN_CXX_STANDARD}) target_compile_options( triton-python-backend-stub PRIVATE $<$,$,$>: - -fvisibility=hidden -Wall -Wextra -Wno-unused-parameter -Wno-type-limits -Werror> + -fvisibility=hidden -Wall -Wextra -Wno-unused-parameter -Wno-type-limits -Werror> + $<$:/Wall /D_WIN32_WINNT=0x0A00 /EHsc /Zc:preprocessor> ) target_compile_definitions(triton-python-backend-stub PRIVATE TRITON_PB_STUB) -target_link_libraries( - triton-python-backend - PRIVATE +# For WIN32 do not link Threads and DL_LIBS +if(WIN32) + target_link_libraries( + triton-python-backend + PRIVATE + dlpack + triton-backend-utils # from repo-backend + -lrt # shared memory + triton-core-serverstub # from repo-core + ZLIB::ZLIB + -larchive + ) + + target_link_libraries( + triton-python-backend-stub + PRIVATE + dlpack + triton-backend-utils # from repo-backend + pybind11::embed + -lrt # shared memory + -larchive # libarchive + ) +else() + target_link_libraries( + triton-python-backend + PRIVATE + dlpack + Threads::Threads + triton-backend-utils # from repo-backend + ${CMAKE_DL_LIBS} # dlopen and dlclose + -lrt # shared memory + triton-core-serverstub # from repo-core + ZLIB::ZLIB + -larchive + ) + + target_link_libraries( + triton-python-backend-stub + PRIVATE dlpack Threads::Threads - triton-backend-utils # from repo-backend - ${CMAKE_DL_LIBS} # dlopen and dlclose - -lrt # shared memory - triton-core-serverstub # from repo-core - ZLIB::ZLIB - -larchive -) - -target_link_libraries( - triton-python-backend-stub - PRIVATE - dlpack - Threads::Threads - triton-backend-utils # from repo-backend - ${CMAKE_DL_LIBS} # dlopen and dlclose - pybind11::embed - -lrt # shared 
memory - -larchive # libarchive -) + triton-backend-utils # from repo-backend + ${CMAKE_DL_LIBS} # dlopen and dlclose + pybind11::embed + -lrt # shared memory + -larchive # libarchive + ) +endif() -set_target_properties( - triton-python-backend PROPERTIES - POSITION_INDEPENDENT_CODE ON - OUTPUT_NAME triton_python - LINK_DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/libtriton_python.ldscript - LINK_FLAGS "-Wl,--version-script libtriton_python.ldscript" -) +if(WIN32) + set_target_properties( + triton-python-backend PROPERTIES + POSITION_INDEPENDENT_CODE ON + OUTPUT_NAME triton_python + ) +else() + set_target_properties( + triton-python-backend PROPERTIES + POSITION_INDEPENDENT_CODE ON + OUTPUT_NAME triton_python + LINK_DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/libtriton_python.ldscript + LINK_FLAGS "-Wl,--version-script libtriton_python.ldscript" + ) +endif() add_subdirectory(./src/shm_monitor) diff --git a/src/infer_request.h b/src/infer_request.h index b8dee87c..ba586535 100644 --- a/src/infer_request.h +++ b/src/infer_request.h @@ -87,7 +87,7 @@ class InferRequest { const uint64_t timeout = 0, const intptr_t response_factory_address = 0, const intptr_t request_address = 0, const PreferredMemory& preferred_memory = - PreferredMemory(PreferredMemory::DEFAULT, 0), + PreferredMemory(PreferredMemory::kDefault, 0), const InferenceTrace& trace = InferenceTrace()); const std::vector>& Inputs(); diff --git a/src/metric_family.cc b/src/metric_family.cc index fb0fb93a..77e8aedf 100644 --- a/src/metric_family.cc +++ b/src/metric_family.cc @@ -201,9 +201,9 @@ TRITONSERVER_MetricKind MetricFamily::ToTritonServerMetricKind(const MetricKind& kind) { switch (kind) { - case COUNTER: + case kCounter: return TRITONSERVER_METRIC_KIND_COUNTER; - case GAUGE: + case kGauge: return TRITONSERVER_METRIC_KIND_GAUGE; default: throw PythonBackendException("Unknown metric kind"); diff --git a/src/pb_env.cc b/src/pb_env.cc index 0b6eb9ec..d9643a62 100644 --- a/src/pb_env.cc +++ b/src/pb_env.cc @@ -26,9 +26,11 @@ #include "pb_env.h" +#ifndef _WIN32 #include #include #include +#endif #include #include @@ -40,6 +42,29 @@ namespace triton { namespace backend { namespace python { +bool +FileExists(std::string& path) +{ + struct stat buffer; + return stat(path.c_str(), &buffer) == 0; +} + +void +LastModifiedTime(const std::string& path, time_t* last_modified_time) +{ + struct stat result; + if (stat(path.c_str(), &result) == 0) { + *last_modified_time = result.st_mtime; + } else { + throw PythonBackendException(std::string( + "LastModifiedTime() failed as file \'" + path + + std::string("\' does not exists."))); + } +} + +// FIXME: [DLIS-5969]: Develop platforom-agnostic functions +// to support custom python environments. 
+#ifndef _WIN32 void CopySingleArchiveEntry(archive* input_archive, archive* output_archive) { @@ -73,7 +98,6 @@ CopySingleArchiveEntry(archive* input_archive, archive* output_archive) } } - void ExtractTarFile(std::string& archive_path, std::string& dst_path) { @@ -153,27 +177,6 @@ ExtractTarFile(std::string& archive_path, std::string& dst_path) } } -bool -FileExists(std::string& path) -{ - struct stat buffer; - return stat(path.c_str(), &buffer) == 0; -} - -void -LastModifiedTime(const std::string& path, time_t* last_modified_time) -{ - struct stat result; - if (stat(path.c_str(), &result) == 0) { - *last_modified_time = result.st_mtime; - } else { - throw PythonBackendException(std::string( - "LastModifiedTime() failed as file \'" + path + - std::string("\' does not exists."))); - } -} - - void RecursiveDirectoryDelete(const char* dir) { @@ -326,5 +329,6 @@ EnvironmentManager::~EnvironmentManager() { RecursiveDirectoryDelete(base_path_); } +#endif }}} // namespace triton::backend::python diff --git a/src/pb_env.h b/src/pb_env.h index 09890ee8..04e01fa3 100644 --- a/src/pb_env.h +++ b/src/pb_env.h @@ -30,6 +30,11 @@ #include #include +#ifdef WIN32 +#include +#undef PATH_MAX +#define PATH_MAX MAX_PATH +#endif namespace triton { namespace backend { namespace python { void ExtractTarFile(std::string& archive_path, std::string& dst_path); @@ -39,6 +44,7 @@ bool FileExists(std::string& path); // // A class that manages Python environments // +#ifndef _WIN32 class EnvironmentManager { std::map> env_map_; char base_path_[PATH_MAX + 1]; @@ -52,5 +58,6 @@ class EnvironmentManager { std::string ExtractIfNotExtracted(std::string env_path); ~EnvironmentManager(); }; +#endif }}} // namespace triton::backend::python diff --git a/src/pb_preferred_memory.h b/src/pb_preferred_memory.h index 55f4db89..c28f1b87 100644 --- a/src/pb_preferred_memory.h +++ b/src/pb_preferred_memory.h @@ -30,10 +30,10 @@ namespace triton { namespace backend { namespace python { class PreferredMemory { public: - enum MemoryType { GPU, CPU, DEFAULT }; + enum MemoryType { kGPU, kCPU, kDefault }; PreferredMemory() - : preferred_memory_type_(MemoryType::DEFAULT), preferred_device_id_(0) + : preferred_memory_type_(MemoryType::kDefault), preferred_device_id_(0) { } diff --git a/src/pb_stub.cc b/src/pb_stub.cc index a7d39852..26003f71 100644 --- a/src/pb_stub.cc +++ b/src/pb_stub.cc @@ -28,7 +28,6 @@ #include #include -#include #include #include @@ -55,6 +54,13 @@ #include "shm_manager.h" #include "triton/common/nvtx.h" +#ifdef _WIN32 +#include // SIGINT & SIGTERM +#include +#else +#include +#endif + #ifdef TRITON_ENABLE_GPU #include #endif // TRITON_ENABLE_GPU @@ -148,6 +154,7 @@ Stub::Instantiate( // interfere with the shared library resolution of other executable and // binaries. 
if (ipc_control_->uses_env) { +#ifndef _WIN32 char* ld_library_path = std::getenv("LD_LIBRARY_PATH"); if (ld_library_path != nullptr) { @@ -173,6 +180,11 @@ Stub::Instantiate( "When using an execution environment, LD_LIBRARY_PATH variable " "cannot be empty."); } +#else + throw PythonBackendException( + "Custom execution environments are not currently supported on " + "Windows."); +#endif } } catch (const PythonBackendException& pb_exception) { @@ -1444,10 +1456,22 @@ Logger::Log( // and pass messages to cerr if (!BackendLoggingActive()) { std::string path(filename); - size_t pos = path.rfind('/'); + size_t pos = path.rfind(std::filesystem::path::preferred_separator); if (pos != std::string::npos) { path = path.substr(pos + 1, std::string::npos); } +#ifdef _WIN32 + std::stringstream ss; + SYSTEMTIME system_time; + GetSystemTime(&system_time); + ss << LeadingLogChar(level) << std::setfill('0') << std::setw(2) + << system_time.wMonth << std::setw(2) << system_time.wDay << ' ' + << std::setw(2) << system_time.wHour << ':' << std::setw(2) + << system_time.wMinute << ':' << std::setw(2) << system_time.wSecond + << '.' << std::setw(6) << system_time.wMilliseconds * 1000 << ' ' + << static_cast(GetCurrentProcessId()) << ' ' << path << ':' + << lineno << "] "; +#else std::stringstream ss; struct timeval tv; gettimeofday(&tv, NULL); @@ -1460,6 +1484,7 @@ Logger::Log( << std::setw(6) << tv.tv_usec << ' ' << static_cast(getpid()) << ' ' << path << ':' << lineno << "] "; std::cerr << ss.str() << " " << message << std::endl; +#endif } else { // Ensure we do not create a stub instance before it has initialized std::unique_ptr& stub = Stub::GetOrCreateInstance(); @@ -1471,37 +1496,37 @@ Logger::Log( void Logger::LogInfo(const std::string& message) { - Logger::Log(message, LogLevel::INFO); + Logger::Log(message, LogLevel::kInfo); } void Logger::LogWarn(const std::string& message) { - Logger::Log(message, LogLevel::WARNING); + Logger::Log(message, LogLevel::kWarning); } void Logger::LogError(const std::string& message) { - Logger::Log(message, LogLevel::ERROR); + Logger::Log(message, LogLevel::kError); } void Logger::LogVerbose(const std::string& message) { - Logger::Log(message, LogLevel::VERBOSE); + Logger::Log(message, LogLevel::kVerbose); } const std::string Logger::LeadingLogChar(const LogLevel& level) { switch (level) { - case LogLevel::WARNING: + case LogLevel::kWarning: return "W"; - case LogLevel::ERROR: + case LogLevel::kError: return "E"; - case LogLevel::INFO: - case LogLevel::VERBOSE: + case LogLevel::kInfo: + case LogLevel::kVerbose: default: return "I"; } @@ -1580,8 +1605,8 @@ PYBIND11_EMBEDDED_MODULE(c_python_backend_utils, module) py::arg("preferred_device_id").none(false) = 0); py::enum_(module, "MemoryType") - .value("TRITONSERVER_MEMORY_GPU", PreferredMemory::MemoryType::GPU) - .value("TRITONSERVER_MEMORY_CPU", PreferredMemory::MemoryType::CPU) + .value("TRITONSERVER_MEMORY_GPU", PreferredMemory::MemoryType::kGPU) + .value("TRITONSERVER_MEMORY_CPU", PreferredMemory::MemoryType::kCPU) .export_values(); py::class_>( @@ -1637,7 +1662,7 @@ PYBIND11_EMBEDDED_MODULE(c_python_backend_utils, module) py::arg("model_version").none(false) = -1, py::arg("flags").none(false) = 0, py::arg("timeout").none(false) = 0, py::arg("preferred_memory").none(false) = - PreferredMemory(PreferredMemory::DEFAULT, 0), + PreferredMemory(PreferredMemory::kDefault, 0), py::arg("trace").none(false) = InferenceTrace(), py::arg("parameters").none(true) = py::none()) .def( @@ -1758,14 +1783,14 @@ 
PYBIND11_EMBEDDED_MODULE(c_python_backend_utils, module) py::class_ logger(module, "Logger"); py::enum_(logger, "LogLevel") - .value("INFO", LogLevel::INFO) - .value("WARNING", LogLevel::WARNING) - .value("ERROR", LogLevel::ERROR) - .value("VERBOSE", LogLevel::VERBOSE) + .value("INFO", LogLevel::kInfo) + .value("WARNING", LogLevel::kWarning) + .value("ERROR", LogLevel::kError) + .value("VERBOSE", LogLevel::kVerbose) .export_values(); logger.def_static( "log", py::overload_cast(&Logger::Log), - py::arg("message"), py::arg("level") = LogLevel::INFO); + py::arg("message"), py::arg("level") = LogLevel::kInfo); logger.def_static("log_info", &Logger::LogInfo, py::arg("message")); logger.def_static("log_warn", &Logger::LogWarn, py::arg("message")); logger.def_static("log_error", &Logger::LogError, py::arg("message")); @@ -1777,8 +1802,8 @@ PYBIND11_EMBEDDED_MODULE(c_python_backend_utils, module) .def("value", &Metric::SendGetValueRequest); py::enum_(module, "MetricKind") - .value("COUNTER", MetricKind::COUNTER) - .value("GAUGE", MetricKind::GAUGE) + .value("COUNTER", MetricKind::kCounter) + .value("GAUGE", MetricKind::kGauge) .export_values(); py::class_>( @@ -1790,8 +1815,8 @@ PYBIND11_EMBEDDED_MODULE(c_python_backend_utils, module) .def( "Metric", &MetricFamily::CreateMetric, py::arg("labels").none(true) = py::none()); - module.attr("MetricFamily").attr("COUNTER") = MetricKind::COUNTER; - module.attr("MetricFamily").attr("GAUGE") = MetricKind::GAUGE; + module.attr("MetricFamily").attr("COUNTER") = MetricKind::kCounter; + module.attr("MetricFamily").attr("GAUGE") = MetricKind::kGauge; module.def( "load_model", &LoadModel, py::arg("model_name").none(false), @@ -1819,12 +1844,13 @@ ModelContext::Init( const std::string& model_path, const std::string& runtime_modeldir, const std::string& triton_install_path, const std::string& model_version) { - type_ = ModelType::DEFAULT; + const char os_slash = std::filesystem::path::preferred_separator; + type_ = ModelType::kDefault; if (runtime_modeldir != "DEFAULT") { // For python based backends, existence of `model.py` in the corresponding // backend folder happens on the core side, so we can omit this check here. - python_model_path_ = runtime_modeldir + "/model.py"; - type_ = ModelType::BACKEND; + python_model_path_ = runtime_modeldir + os_slash + "model.py"; + type_ = ModelType::kBackend; } else { python_model_path_ = model_path; // Check if model file exists in this path. @@ -1835,7 +1861,7 @@ ModelContext::Init( } } - model_dir_ = model_path.substr(0, model_path.find_last_of("\\/")); + model_dir_ = model_path.substr(0, model_path.find_last_of(os_slash)); python_backend_folder_ = triton_install_path; model_version_ = model_version; runtime_modeldir_ = runtime_modeldir; @@ -1844,8 +1870,9 @@ ModelContext::Init( void ModelContext::StubSetup(py::module& sys) { + const char os_slash = std::filesystem::path::preferred_separator; std::string model_name = - python_model_path_.substr(python_model_path_.find_last_of("/") + 1); + python_model_path_.substr(python_model_path_.find_last_of(os_slash) + 1); // Model name without the .py extension auto dotpy_pos = model_name.find_last_of(".py"); @@ -1858,11 +1885,11 @@ ModelContext::StubSetup(py::module& sys) // returned by 'find_last_of'. Need to manually adjust the position. 
std::string model_name_trimmed = model_name.substr(0, dotpy_pos - 2); - if (type_ == ModelType::DEFAULT) { + if (type_ == ModelType::kDefault) { std::string model_path_parent = - python_model_path_.substr(0, python_model_path_.find_last_of("/")); + python_model_path_.substr(0, python_model_path_.find_last_of(os_slash)); std::string model_path_parent_parent = - model_path_parent.substr(0, model_path_parent.find_last_of("/")); + model_path_parent.substr(0, model_path_parent.find_last_of(os_slash)); sys.attr("path").attr("append")(model_path_parent); sys.attr("path").attr("append")(model_path_parent_parent); sys.attr("path").attr("append")(python_backend_folder_); @@ -1870,7 +1897,7 @@ ModelContext::StubSetup(py::module& sys) (std::string(model_version_) + "." + model_name_trimmed).c_str()); } else { std::string model_path_parent = - python_model_path_.substr(0, python_model_path_.find_last_of("/")); + python_model_path_.substr(0, python_model_path_.find_last_of(os_slash)); std::string backend_model_dir(model_path_parent); sys.attr("path").attr("append")(backend_model_dir); sys.attr("path").attr("append")(python_backend_folder_); @@ -1878,6 +1905,22 @@ ModelContext::StubSetup(py::module& sys) } } +#ifdef _WIN32 +bool +ParentProcessActive(DWORD parent_id) +{ + HANDLE parent = OpenProcess(PROCESS_ALL_ACCESS, FALSE, parent_id); + DWORD exit_code; + GetExitCodeProcess(parent, &exit_code); + return (exit_code == STILL_ACTIVE); +} +#else +bool +ParentProcessActive(pid_t parent_id) +{ + return (kill(parent_id, 0) == 0); +} +#endif extern "C" { @@ -1902,8 +1945,9 @@ main(int argc, char** argv) // Find the package name from model path. size_t prev = 0, pos = 0; + const char os_slash = std::filesystem::path::preferred_separator; do { - pos = model_path.find("/", prev); + pos = model_path.find(os_slash, prev); if (pos == std::string::npos) pos = model_path.length(); std::string token = model_path.substr(prev, pos - prev); @@ -1938,8 +1982,11 @@ main(int argc, char** argv) // Start the Python Interpreter py::scoped_interpreter guard{}; +#ifdef _WIN32 + DWORD parent_pid = (DWORD)std::stoul(argv[5]); +#else pid_t parent_pid = std::stoi(argv[5]); - +#endif std::atomic background_thread_running = {true}; std::thread background_thread = std::thread([&parent_pid, &background_thread_running, &stub, &logger] { @@ -1958,7 +2005,7 @@ main(int argc, char** argv) stub->UpdateHealth(); - if (kill(parent_pid, 0) != 0) { + if (!ParentProcessActive(parent_pid)) { // When unhealthy, we should stop attempting to send // messages to the backend ASAP. if (stub->StubToParentServiceActive()) { diff --git a/src/pb_stub.h b/src/pb_stub.h index 74a66b95..a51f25f5 100644 --- a/src/pb_stub.h +++ b/src/pb_stub.h @@ -30,18 +30,7 @@ #include #include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include +#include #include "infer_request.h" #include "infer_response.h" @@ -81,17 +70,17 @@ namespace triton { namespace backend { namespace python { } while (false) /// Macros that use current filename and line number. 
-#define LOG_INFO LOG_FL(__FILE__, __LINE__, LogLevel::INFO) -#define LOG_WARN LOG_FL(__FILE__, __LINE__, LogLevel::WARNING) -#define LOG_ERROR LOG_FL(__FILE__, __LINE__, LogLevel::ERROR) -#define LOG_VERBOSE LOG_FL(__FILE__, __LINE__, LogLevel::VERBOSE) +#define LOG_INFO LOG_FL(__FILE__, __LINE__, LogLevel::kInfo) +#define LOG_WARN LOG_FL(__FILE__, __LINE__, LogLevel::kWarning) +#define LOG_ERROR LOG_FL(__FILE__, __LINE__, LogLevel::kError) +#define LOG_VERBOSE LOG_FL(__FILE__, __LINE__, LogLevel::kVerbose) class Logger { public: Logger() { backend_logging_active_ = false; }; ~Logger() { log_instance_.reset(); }; /// Python client log function - static void Log(const std::string& message, LogLevel level = LogLevel::INFO); + static void Log(const std::string& message, LogLevel level = LogLevel::kInfo); /// Python client log info function static void LogInfo(const std::string& message); @@ -138,7 +127,8 @@ class LogMessage { LogMessage(const char* file, int line, LogLevel level) : level_(level) { std::string path(file); - size_t pos = path.rfind('/'); + const char os_slash = std::filesystem::path::preferred_separator; + size_t pos = path.rfind(os_slash); if (pos != std::string::npos) { path = path.substr(pos + 1, std::string::npos); } @@ -185,10 +175,10 @@ class ModelContext { // Triton supports python-based backends, // i.e. backends that provide common `model.py`, that can be re-used // between different models. `ModelType` helps to differentiate - // between models running with c++ python backend (ModelType::DEFAULT) - // and models running with python-based backend (ModelType::BACKEND) + // between models running with c++ python backend (ModelType::kDefault) + // and models running with python-based backend (ModelType::kBackend) // at the time of ModelContext::StubSetup to properly set up paths. - enum ModelType { DEFAULT, BACKEND }; + enum ModelType { kDefault, kBackend }; ModelType type_; }; diff --git a/src/pb_utils.cc b/src/pb_utils.cc index 5aa95b8b..7bc17fa4 100644 --- a/src/pb_utils.cc +++ b/src/pb_utils.cc @@ -26,27 +26,14 @@ #include "pb_utils.h" -#include -#include +#ifdef _WIN32 +#include + +#include +#else #include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include -#include - -#include "scoped_defer.h" +#endif + #ifdef TRITON_ENABLE_GPU #include @@ -59,42 +46,43 @@ namespace triton { namespace backend { namespace python { CUDAHandler::CUDAHandler() { - dl_open_handle_ = dlopen("libcuda.so", RTLD_LAZY); + dl_open_handle_ = LoadSharedObject("libcuda.so"); // If libcuda.so is successfully opened, it must be able to find // "cuPointerGetAttribute", "cuGetErrorString", and // "cuDevicePrimaryCtxGetState" symbols. if (dl_open_handle_ != nullptr) { - void* cu_pointer_get_attribute_fn = - dlsym(dl_open_handle_, "cuPointerGetAttribute"); + void* cu_pointer_get_attribute_fn = LocateSymbol("cuPointerGetAttribute"); if (cu_pointer_get_attribute_fn == nullptr) { throw PythonBackendException( - std::string("Failed to dlsym 'cuPointerGetAttribute'. Error: ") + - dlerror()); + std::string("Failed to locate 'cuPointerGetAttribute'. Error: ") + + LocateSymbolError()); } *((void**)&cu_pointer_get_attribute_fn_) = cu_pointer_get_attribute_fn; - void* cu_get_error_string_fn = dlsym(dl_open_handle_, "cuGetErrorString"); + void* cu_get_error_string_fn = LocateSymbol("cuGetErrorString"); if (cu_get_error_string_fn == nullptr) { throw PythonBackendException( - std::string("Failed to dlsym 'cuGetErrorString'. 
Error: ") + - dlerror()); + std::string("Failed to locate 'cuGetErrorString'. Error: ") + + LocateSymbolError()); } *((void**)&cu_get_error_string_fn_) = cu_get_error_string_fn; - void* cu_init_fn = dlsym(dl_open_handle_, "cuInit"); + void* cu_init_fn = LocateSymbol("cuInit"); if (cu_init_fn == nullptr) { throw PythonBackendException( - std::string("Failed to dlsym 'cuInit'. Error: ") + dlerror()); + std::string("Failed to locate 'cuInit'. Error: ") + + LocateSymbolError()); } *((void**)&cu_init_fn_) = cu_init_fn; void* cu_device_primary_ctx_get_state_fn = - dlsym(dl_open_handle_, "cuDevicePrimaryCtxGetState"); + LocateSymbol("cuDevicePrimaryCtxGetState"); if (cu_device_primary_ctx_get_state_fn == nullptr) { throw PythonBackendException( - std::string("Failed to dlsym 'cuDevicePrimaryCtxGetState'. Error: ") + - dlerror()); + std::string( + "Failed to locate 'cuDevicePrimaryCtxGetState'. Error: ") + + LocateSymbolError()); } *((void**)&cu_device_primary_ctx_get_state_fn_) = cu_device_primary_ctx_get_state_fn; @@ -105,10 +93,7 @@ CUDAHandler::CUDAHandler() const char* error_string; (*cu_get_error_string_fn_)(cuda_err, &error_string); error_str_ = std::string("failed to call cuInit: ") + error_string; - int status = dlclose(dl_open_handle_); - if (status != 0) { - throw PythonBackendException("Failed to close the libcuda handle."); - } + CloseLibrary(); dl_open_handle_ = nullptr; } } @@ -215,13 +200,58 @@ CUDAHandler::MaybeSetDevice(int device) CUDAHandler::~CUDAHandler() noexcept(false) { if (dl_open_handle_ != nullptr) { - int status = dlclose(dl_open_handle_); - if (status != 0) { - throw PythonBackendException("Failed to close the libcuda handle."); - } + CloseLibrary(); + } +} + +void* +CUDAHandler::LoadSharedObject(const char* filename) +{ +#ifdef _WIN32 + // NOTE: 'nvcuda.dll' is a placeholder library. Apparently, this should be the + // equivalent library for Windows, but need to verify. + return LoadLibraryA("nvcuda.dll"); +#else + return dlopen("libcuda.so", RTLD_LAZY); +#endif +} + +void* +CUDAHandler::LocateSymbol(const char* symbol) +{ +#ifdef _WIN32 + return GetProcAddress(static_cast(dl_open_handle_), symbol); +#else + return dlsym(dl_open_handle_, symbol); +#endif +} + + +std::string +CUDAHandler::LocateSymbolError() +{ +#ifdef _WIN32 + return std::to_string(GetLastError()); +#else + return dlerror(); +#endif +} + +void +CUDAHandler::CloseLibrary() +{ + bool successful = true; +#ifdef _WIN32 + successful = (FreeLibrary(static_cast(dl_open_handle_)) != 0); +#else + successful = (dlclose(dl_open_handle_) == 0); +#endif + if (!successful) { + throw PythonBackendException("Failed to close the cuda library handle."); } } + ScopedSetDevice::ScopedSetDevice(int device) { device_ = device; @@ -258,6 +288,14 @@ IsUsingCUDAPool( #endif // TRITON_ENABLE_GPU +// FIXME: [DLIS-6078]: We should not need this function. However, some paths are +// being retrieved from core that are not platform-agnostic. 
+void +SanitizePath(std::string& path) +{ + std::replace(path.begin(), path.end(), '/', '\\'); +} + #ifndef TRITON_PB_STUB std::shared_ptr WrapTritonErrorInSharedPtr(TRITONSERVER_Error* error) diff --git a/src/pb_utils.h b/src/pb_utils.h index 0873eb03..6d5f21ce 100644 --- a/src/pb_utils.h +++ b/src/pb_utils.h @@ -29,7 +29,6 @@ #ifdef TRITON_ENABLE_GPU #include #endif // TRITON_ENABLE_GPU -#include #include #include @@ -167,9 +166,9 @@ struct ResponseBatch : SendMessageBase { uint32_t response_size; }; -enum LogLevel { INFO = 0, WARNING, ERROR, VERBOSE }; +enum LogLevel { kInfo = 0, kWarning, kError, kVerbose }; -enum MetricKind { COUNTER, GAUGE }; +enum MetricKind { kCounter = 0, kGauge }; struct LogSendMessage : SendMessageBase { bi::managed_external_buffer::handle_t filename; @@ -294,6 +293,10 @@ class CUDAHandler { int64_t memory_type_id, cudaIpcMemHandle_t* cuda_mem_handle, void** data_ptr); void CloseCudaHandle(int64_t memory_type_id, void* data_ptr); + void* LoadSharedObject(const char* filename); + void* LocateSymbol(const char* symbol); + std::string LocateSymbolError(); + void CloseLibrary(); /// Set the device only if the primary context has already been created for /// this device. Inspired from PyTorch's MaybeSetDevice. @@ -323,6 +326,10 @@ bool IsUsingCUDAPool( #endif // TRITON_ENABLE_GPU +// FIXME: [DLIS-6078]: We should not need this function. However, some paths are +// being retrieved from core that are not platform-agnostic. +void SanitizePath(std::string& path); + #ifndef TRITON_PB_STUB std::shared_ptr WrapTritonErrorInSharedPtr( TRITONSERVER_Error* error); diff --git a/src/python_be.cc b/src/python_be.cc index befdd593..0fa318ff 100644 --- a/src/python_be.cc +++ b/src/python_be.cc @@ -25,6 +25,8 @@ // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "python_be.h" +#include + #include "gpu_buffers.h" #include "infer_payload.h" #include "model_loader.h" @@ -367,12 +369,15 @@ ModelInstanceState::SaveRequestsToSharedMemory( uint32_t flags; RETURN_IF_ERROR(TRITONBACKEND_RequestFlags(request, &flags)); + // Do not return if error in this case, because Triton core + // will return an error if tracing is disabled (see PYBE PR#295). 
TRITONSERVER_InferenceTrace* triton_trace; auto err = TRITONBACKEND_RequestTrace(request, &triton_trace); if (err != nullptr) { triton_trace = nullptr; TRITONSERVER_ErrorDelete(err); } + InferenceTrace trace = InferenceTrace(triton_trace); uint64_t request_timeout; @@ -389,14 +394,14 @@ ModelInstanceState::SaveRequestsToSharedMemory( model_state->Name(), model_state->Version(), parameters_string, flags, request_timeout, reinterpret_cast(factory_ptr), reinterpret_cast(request), - PreferredMemory(PreferredMemory::DEFAULT, 0), trace); + PreferredMemory(PreferredMemory::kDefault, 0), trace); } else { infer_request = std::make_unique( id, correlation_id, pb_input_tensors, requested_output_names, model_state->Name(), model_state->Version(), parameters_string, flags, request_timeout, 0 /* response_factory_address */, reinterpret_cast(request), - PreferredMemory(PreferredMemory::DEFAULT, 0), trace); + PreferredMemory(PreferredMemory::kDefault, 0), trace); } RETURN_IF_EXCEPTION(infer_request->SaveToSharedMemory(Stub()->ShmPool())); @@ -884,25 +889,25 @@ ModelInstanceState::ProcessLogRequest( LogLevel level = pb_log_message->Level(); switch (level) { - case LogLevel::INFO: { + case LogLevel::kInfo: { TRITONSERVER_LogMessage( TRITONSERVER_LOG_INFO, (filename.c_str()), line, (log_message.c_str())); break; } - case LogLevel::WARNING: { + case LogLevel::kWarning: { TRITONSERVER_LogMessage( TRITONSERVER_LOG_WARN, (filename.c_str()), line, (log_message.c_str())); break; } - case LogLevel::ERROR: { + case LogLevel::kError: { TRITONSERVER_LogMessage( TRITONSERVER_LOG_ERROR, (filename.c_str()), line, (log_message.c_str())); break; } - case LogLevel::VERBOSE: { + case LogLevel::kVerbose: { TRITONSERVER_LogMessage( TRITONSERVER_LOG_VERBOSE, (filename.c_str()), line, (log_message.c_str())); @@ -1422,7 +1427,7 @@ ModelInstanceState::ProcessRequests( // This means that the stub process has exited and Python // backend failed to restart the stub process. - if (Stub()->StubPid() == 0) { + if (!Stub()->StubActive()) { const char* error_message = "The stub process has exited unexpectedly."; RespondErrorToAllRequests( error_message, responses, requests, request_count); @@ -2056,7 +2061,7 @@ ModelState::SetModelConfig() extern "C" { -TRITONSERVER_Error* +TRITONBACKEND_ISPEC TRITONSERVER_Error* TRITONBACKEND_Initialize(TRITONBACKEND_Backend* backend) { const char* cname; @@ -2239,27 +2244,33 @@ TRITONBACKEND_Initialize(TRITONBACKEND_Backend* backend) .c_str()); // Use BackendArtifacts to determine the location of Python files - const char* location; + const char* clocation; TRITONBACKEND_ArtifactType artifact_type; RETURN_IF_ERROR( - TRITONBACKEND_BackendArtifacts(backend, &artifact_type, &location)); - + TRITONBACKEND_BackendArtifacts(backend, &artifact_type, &clocation)); + + const char os_slash = std::filesystem::path::preferred_separator; + std::string location(clocation); +#ifdef _WIN32 + const std::string stub_executable_name = "triton_python_backend_stub.exe"; + SanitizePath(location); + SanitizePath(default_backend_dir_string); +#else + const std::string stub_executable_name = "triton_python_backend_stub"; +#endif // Check if `triton_python_backend_stub` and `triton_python_backend_utils.py` // are located under `location`. - // DLIS-5596: Add forward slash to be platform agnostic - // (i.e. For Windows, we need to use backward slash). 
std::string default_python_backend_dir = - default_backend_dir_string + "/python"; - std::string backend_stub_path = - std::string(location) + "/triton_python_backend_stub"; + default_backend_dir_string + os_slash + "python"; + std::string backend_stub_path = location + os_slash + stub_executable_name; std::string backend_utils = - std::string(location) + "/triton_python_backend_utils.py"; + location + os_slash + "triton_python_backend_utils.py"; // Both, stub and utils should be in the same location if (FileExists(backend_stub_path) && FileExists(backend_utils)) { backend_state->python_lib = location; // If `location` is default location of a python backend, // then we are using default python backend. - if (default_python_backend_dir == std::string(location)) { + if (default_python_backend_dir == location) { backend_state->runtime_modeldir = ""; } else { // If `location` is not default location of a python backend, @@ -2272,22 +2283,26 @@ TRITONBACKEND_Initialize(TRITONBACKEND_Backend* backend) // then we are using a python backend based backend and stub and utils are // stored in the default python backend location. if (!default_backend_dir_string.empty()) { - std::string backend_stub_path = - default_backend_dir_string + "/python/triton_python_backend_stub"; + std::string backend_stub_path = default_backend_dir_string + os_slash + + "python" + os_slash + + stub_executable_name; if (!FileExists(backend_stub_path)) { return TRITONSERVER_ErrorNew( TRITONSERVER_ERROR_NOT_FOUND, - (std::string("triton_python_backend_stub") + - " is not found. Searched paths: " + default_backend_dir_string + - "/python and" + std::string(location)) + (stub_executable_name + " is not found. Searched paths: " + + default_backend_dir_string + os_slash + "python and " + location) .c_str()); } } backend_state->runtime_modeldir = location; - backend_state->python_lib = default_backend_dir_string + "/python"; + backend_state->python_lib = + default_backend_dir_string + os_slash + "python"; } - +// FIXME [DLIS-5969]: Enable for Windows when custom execution environments +// are supported. 
+#ifndef _WIN32 backend_state->env_manager = std::make_unique(); +#endif RETURN_IF_ERROR(TRITONBACKEND_BackendSetState( backend, reinterpret_cast(backend_state.get()))); @@ -2296,7 +2311,7 @@ TRITONBACKEND_Initialize(TRITONBACKEND_Backend* backend) return nullptr; } -TRITONSERVER_Error* +TRITONBACKEND_ISPEC TRITONSERVER_Error* TRITONBACKEND_Finalize(TRITONBACKEND_Backend* backend) { LOG_MESSAGE(TRITONSERVER_LOG_VERBOSE, "TRITONBACKEND_Finalize: Start"); @@ -2308,7 +2323,7 @@ TRITONBACKEND_Finalize(TRITONBACKEND_Backend* backend) return nullptr; // success } -TRITONSERVER_Error* +TRITONBACKEND_ISPEC TRITONSERVER_Error* TRITONBACKEND_ModelInitialize(TRITONBACKEND_Model* model) { const char* cname; @@ -2335,7 +2350,7 @@ TRITONBACKEND_ModelInitialize(TRITONBACKEND_Model* model) return nullptr; } -TRITONSERVER_Error* +TRITONBACKEND_ISPEC TRITONSERVER_Error* TRITONBACKEND_ModelFinalize(TRITONBACKEND_Model* model) { void* vstate; @@ -2351,7 +2366,7 @@ TRITONBACKEND_ModelFinalize(TRITONBACKEND_Model* model) return nullptr; } -TRITONSERVER_Error* +TRITONBACKEND_ISPEC TRITONSERVER_Error* TRITONBACKEND_ModelInstanceInitialize(TRITONBACKEND_ModelInstance* instance) { const char* cname; @@ -2394,7 +2409,7 @@ TRITONBACKEND_ModelInstanceInitialize(TRITONBACKEND_ModelInstance* instance) return nullptr; } -TRITONSERVER_Error* +TRITONBACKEND_ISPEC TRITONSERVER_Error* TRITONBACKEND_ModelInstanceExecute( TRITONBACKEND_ModelInstance* instance, TRITONBACKEND_Request** requests, const uint32_t request_count) @@ -2519,7 +2534,7 @@ TRITONBACKEND_ModelInstanceExecute( return nullptr; } -TRITONSERVER_Error* +TRITONBACKEND_ISPEC TRITONSERVER_Error* TRITONBACKEND_ModelInstanceFinalize(TRITONBACKEND_ModelInstance* instance) { void* vstate; @@ -2536,7 +2551,7 @@ TRITONBACKEND_ModelInstanceFinalize(TRITONBACKEND_ModelInstance* instance) return nullptr; } -TRITONSERVER_Error* +TRITONBACKEND_ISPEC TRITONSERVER_Error* TRITONBACKEND_GetBackendAttribute( TRITONBACKEND_Backend* backend, TRITONBACKEND_BackendAttribute* backend_attributes) diff --git a/src/python_be.h b/src/python_be.h index f5620d07..4430767c 100644 --- a/src/python_be.h +++ b/src/python_be.h @@ -26,12 +26,8 @@ #pragma once -#include #include #include -#include -#include -#include #include #include @@ -84,6 +80,14 @@ #include "triton/core/tritonbackend.h" #include "triton/core/tritonserver.h" +#ifdef _WIN32 +#define NOMINMAX +#include +#else +#include +#include +#endif + #define LOG_IF_EXCEPTION(X) \ do { \ try { \ @@ -217,7 +221,12 @@ struct BackendState { std::atomic number_of_instance_inits; std::string shared_memory_region_prefix; int64_t thread_pool_size; + +// FIXME [DLIS-5969]: Enable for Windows when custom execution environments +// are supported. +#ifndef _WIN32 std::unique_ptr env_manager; +#endif std::string runtime_modeldir; }; @@ -299,7 +308,8 @@ class ModelInstanceState : public BackendModelInstance { // Launch stub process. 
TRITONSERVER_Error* LaunchStubProcess(); - TRITONSERVER_Error* SendMessageToStub(off_t message); + TRITONSERVER_Error* SendMessageToStub( + bi::managed_external_buffer::handle_t message); void ResponseSendDecoupled(std::shared_ptr response_send_message); // Checks whether the stub process is live @@ -307,7 +317,8 @@ class ModelInstanceState : public BackendModelInstance { // Get a message from the stub process void SendMessageAndReceiveResponse( - off_t message, off_t& response, bool& restart, + bi::managed_external_buffer::handle_t message, + bi::managed_external_buffer::handle_t& response, bool& restart, std::shared_ptr>& responses, TRITONBACKEND_Request** requests, const uint32_t request_count); diff --git a/src/request_executor.cc b/src/request_executor.cc index 65f53710..d78972a5 100644 --- a/src/request_executor.cc +++ b/src/request_executor.cc @@ -48,10 +48,10 @@ MemoryTypeToTritonMemoryType( const PreferredMemory::MemoryType& memory_type) { switch (memory_type) { - case PreferredMemory::MemoryType::CPU: + case PreferredMemory::MemoryType::kCPU: *triton_memory_type = TRITONSERVER_MEMORY_CPU; break; - case PreferredMemory::MemoryType::GPU: + case PreferredMemory::MemoryType::kGPU: *triton_memory_type = TRITONSERVER_MEMORY_GPU; break; @@ -202,7 +202,7 @@ ResponseAlloc( ScopedDefer _([&shm_pool] { shm_pool.release(); }); if (p->preferred_memory.PreferredMemoryType() == - PreferredMemory::MemoryType::DEFAULT) { + PreferredMemory::MemoryType::kDefault) { *actual_memory_type = preferred_memory_type; *actual_memory_type_id = preferred_memory_type_id; } else { diff --git a/src/shm_manager.h b/src/shm_manager.h index 5063273b..25e04570 100644 --- a/src/shm_manager.h +++ b/src/shm_manager.h @@ -26,8 +26,6 @@ #pragma once -#include - #include #include #include @@ -92,9 +90,9 @@ struct AllocatedSharedMemory { // info is placed in the beginning and the actual object is placed after that // (i.e. 4 plus the aligned address is not 16-bytes aligned). The aligned memory // is required by semaphore otherwise it may lead to SIGBUS error on ARM. 
-struct AllocatedShmOwnership { +struct alignas(16) AllocatedShmOwnership { uint32_t ref_count_; -} __attribute__((aligned(16))); +}; class SharedMemoryManager { public: diff --git a/src/stub_launcher.cc b/src/stub_launcher.cc index b0627486..a9956b55 100644 --- a/src/stub_launcher.cc +++ b/src/stub_launcher.cc @@ -26,12 +26,18 @@ #include "stub_launcher.h" +#include + #include "python_be.h" +#ifdef _WIN32 +#include // getpid() +#endif + namespace triton { namespace backend { namespace python { StubLauncher::StubLauncher(const std::string stub_process_kind) - : parent_pid_(0), stub_pid_(0), is_initialized_(false), + : parent_pid_(0), is_initialized_(false), stub_process_kind_(stub_process_kind), model_instance_name_(""), device_id_(0), kind_("") { @@ -40,8 +46,7 @@ StubLauncher::StubLauncher(const std::string stub_process_kind) StubLauncher::StubLauncher( const std::string stub_process_kind, const std::string model_instance_name, const int32_t device_id, const std::string kind) - : parent_pid_(0), stub_pid_(0), is_initialized_(false), - stub_process_kind_(stub_process_kind), + : is_initialized_(false), stub_process_kind_(stub_process_kind), model_instance_name_(model_instance_name), device_id_(device_id), kind_(kind) { @@ -65,6 +70,13 @@ StubLauncher::Initialize(ModelState* model_state) if (runtime_modeldir_.empty()) { runtime_modeldir_ = "DEFAULT"; } +#ifdef _WIN32 + ZeroMemory(&startup_info_, sizeof(startup_info_)); + startup_info_.cb = sizeof(startup_info_); + ZeroMemory(&stub_pid_, sizeof(stub_pid_)); +#else + stub_pid_ = 0; +#endif // Atomically increase and read the stub process count to avoid shared memory // region name collision @@ -76,7 +88,8 @@ StubLauncher::Initialize(ModelState* model_state) model_version_ = model_state->Version(); std::stringstream ss; - ss << model_repository_path_ << "/" << model_version_ << "/"; + const char os_slash = std::filesystem::path::preferred_separator; + ss << model_repository_path_ << os_slash << model_version_ << os_slash; std::string artifact_name; RETURN_IF_ERROR(model_state->ModelConfig().MemberAsString( "default_model_filename", &artifact_name)); @@ -89,31 +102,20 @@ StubLauncher::Initialize(ModelState* model_state) model_path_ = ss.str(); - // Path to the extracted Python env - std::string python_execution_env = ""; + // FIXME [DLIS-5969]: Enable for Windows when custom execution environments + // are supported. if (python_execution_env_ != "") { - try { - python_execution_env = - model_state->StateForBackend()->env_manager->ExtractIfNotExtracted( - python_execution_env_); - } - catch (PythonBackendException& pb_exception) { - return TRITONSERVER_ErrorNew( - TRITONSERVER_ERROR_INTERNAL, pb_exception.what()); - } - - path_to_activate_ = python_execution_env + "/bin/activate"; - path_to_libpython_ = python_execution_env + "/lib"; - if (python_execution_env.length() > 0 && !FileExists(path_to_activate_)) { - return TRITONSERVER_ErrorNew( - TRITONSERVER_ERROR_INTERNAL, - ("Path " + path_to_activate_ + - " does not exist. 
The Python environment should contain an " - "'activate' script.") - .c_str()); - } +#ifndef _WIN32 + RETURN_IF_ERROR(GetPythonEnvironment(model_state)); +#else + return TRITONSERVER_ErrorNew( + TRITONSERVER_ERROR_UNSUPPORTED, + "Custom execution environments are not currently supported on " + "Windows."); +#endif } + parent_pid_ = getpid(); return nullptr; @@ -195,6 +197,139 @@ StubLauncher::Setup() return nullptr; } +// FIXME: This should be merged with the Unix launch function once Windows +// CI and functionality are demonstrably stable. The goal of keeping the +// functions separate is to help debug Windows-specific issues without worrying +// about the impact to our Unix builds. +#ifdef _WIN32 +TRITONSERVER_Error* +StubLauncher::Launch() +{ + std::string stub_name; + if (stub_process_kind_ == "AUTOCOMPLETE_STUB") { + stub_name = model_name_; + } else { + stub_name = model_instance_name_; + } + + const char os_slash = std::filesystem::path::preferred_separator; + + const std::string stub_executable_name = "triton_python_backend_stub.exe"; + SanitizePath(model_path_); + SanitizePath(model_repository_path_); + + // Default Python backend stub + std::string python_backend_stub = + python_lib_ + os_slash + stub_executable_name; + + LOG_MESSAGE( + TRITONSERVER_LOG_INFO, + (std::string("Stub path ") + python_backend_stub).c_str()); + + // Path to alternative Python backend stub + std::string model_python_backend_stub = + std::string(model_repository_path_) + os_slash + stub_executable_name; + + LOG_MESSAGE( + TRITONSERVER_LOG_INFO, + (std::string("Alt path ") + python_backend_stub).c_str()); + + // Check if file exists + // TODO: Integrate win32 and pb_env + if (FileExists(model_python_backend_stub)) { + python_backend_stub = model_python_backend_stub; + } + + std::string launch_command; + + std::stringstream ss; + ss << python_backend_stub << " " << model_path_ << " " << shm_region_name_ + << " " << shm_default_byte_size_ << " " << shm_growth_byte_size_ << " " + << parent_pid_ << " " << python_lib_ << " " << ipc_control_handle_ << " " + << stub_name << " " << runtime_modeldir_; + launch_command = ss.str(); + + LOG_MESSAGE( + TRITONSERVER_LOG_INFO, + (std::string("Starting Python backend stub: ") + launch_command).c_str()); + + LPSTR launch_command_lpstr = const_cast(launch_command.c_str()); + // Start the child process. Unlike fork(), the remainder of this + // function exists in the context of the parent, only. + if (!CreateProcess( + NULL, // No module name (use command line) + launch_command_lpstr, // Command line + NULL, // Process handle not inheritable + NULL, // Thread handle not inheritable + FALSE, // Set handle inheritance to FALSE + 0, // No creation flags + NULL, // Use parent's environment block + NULL, // Use parent's starting directory + &startup_info_, // Pointer to STARTUPINFO structure + &stub_pid_) // Pointer to PROCESS_INFORMATION structure + ) { + std::stringstream ss; + ss << "Failed to run python backend stub. Errno = " << errno << '\n' + << "Python backend stub path: " << python_backend_stub << '\n' + << "Shared Memory Region Name: " << shm_region_name_ << '\n' + << "Shared Memory Default Byte Size: " << shm_default_byte_size_ << '\n' + << "Shared Memory Growth Byte Size: " << shm_growth_byte_size_ << '\n'; + // Print the error message directly because the underlying mutexes in + // LOG_MESSAGE() could be forked when it is locked by other thread(s). 
+ std::cerr << '\n' << ss.str() << '\n'; + _Exit(1); + } + ScopedDefer _([&] { + // Push a dummy message to the message queue so that the stub + // process is notified that it can release the object stored in + // shared memory. + stub_message_queue_->Push(DUMMY_MESSAGE); + + // If the model is not initialized, wait for the stub process to exit. + if (!is_initialized_) { + stub_message_queue_.reset(); + parent_message_queue_.reset(); + memory_manager_.reset(); + WaitForStubProcess(); + } + }); + + // The stub process would send two messages to the parent process during the + // initialization. + // 1. When the stub process's health monitoring thread has started. + // 2. When the initialization is fully completed and the Python model is + // loaded. + // + // The reason it is broken into two steps is that creation of the health + // monitoring thread may take longer which can make the server process think + // that the stub process is unhealthy and return early. Waiting until the + // health thread is spawn would make sure would prevent this issue. + parent_message_queue_->Pop(); + + if (stub_process_kind_ == "AUTOCOMPLETE_STUB") { + try { + AutocompleteStubProcess(); + } + catch (const PythonBackendException& ex) { + // Need to kill the stub process first + KillStubProcess(); + throw BackendModelException( + TRITONSERVER_ErrorNew(TRITONSERVER_ERROR_INTERNAL, ex.what())); + } + } else if (stub_process_kind_ == "MODEL_INSTANCE_STUB") { + RETURN_IF_ERROR(ModelInstanceStubProcess()); + } else { + return TRITONSERVER_ErrorNew( + TRITONSERVER_ERROR_INTERNAL, + (std::string("Unknown stub_process_kind: ") + stub_process_kind_) + .c_str()); + } + + is_initialized_ = true; + + return nullptr; +} +#else TRITONSERVER_Error* StubLauncher::Launch() { @@ -307,11 +442,10 @@ StubLauncher::Launch() // If the model is not initialized, wait for the stub process to exit. if (!is_initialized_) { - int status; stub_message_queue_.reset(); parent_message_queue_.reset(); memory_manager_.reset(); - waitpid(stub_pid_, &status, 0); + WaitForStubProcess(); } }); @@ -335,10 +469,7 @@ StubLauncher::Launch() } catch (const PythonBackendException& ex) { // Need to kill the stub process first - kill(stub_pid_, SIGKILL); - int status; - waitpid(stub_pid_, &status, 0); - stub_pid_ = 0; + KillStubProcess(); throw BackendModelException( TRITONSERVER_ErrorNew(TRITONSERVER_ERROR_INTERNAL, ex.what())); } @@ -357,6 +488,34 @@ StubLauncher::Launch() return nullptr; } +TRITONSERVER_Error* +StubLauncher::GetPythonEnvironment(ModelState* model_state) +{ + std::string python_execution_env = ""; + try { + python_execution_env = + model_state->StateForBackend()->env_manager->ExtractIfNotExtracted( + python_execution_env_); + } + catch (PythonBackendException& pb_exception) { + return TRITONSERVER_ErrorNew( + TRITONSERVER_ERROR_INTERNAL, pb_exception.what()); + } + + path_to_activate_ = python_execution_env + "/bin/activate"; + path_to_libpython_ = python_execution_env + "/lib"; + if (python_execution_env.length() > 0 && !FileExists(path_to_activate_)) { + return TRITONSERVER_ErrorNew( + TRITONSERVER_ERROR_INTERNAL, + ("Path " + path_to_activate_ + + " does not exist. 
The Python environment should contain an " + "'activate' script.") + .c_str()); + } + return nullptr; +} +#endif + void StubLauncher::AutocompleteStubProcess() { @@ -473,6 +632,18 @@ StubLauncher::ModelInstanceStubProcess() return nullptr; } +bool +StubLauncher::StubActive() +{ +#ifdef _WIN32 + DWORD ec; + GetExitCodeProcess(stub_pid_.hProcess, &ec); + return (ec == STILL_ACTIVE); +#else + return (stub_pid_ != 0); +#endif +} + void StubLauncher::UpdateHealth() { @@ -483,9 +654,13 @@ StubLauncher::UpdateHealth() ipc_control_->stub_health = false; } - // Sleep 1 second so that the child process has a chance to change the - // health variable +// Sleep 1 second so that the child process has a chance to change the +// health variable +#ifdef _WIN32 + Sleep(1); +#else sleep(1); +#endif { bi::scoped_lock lock(*health_mutex_); @@ -515,11 +690,11 @@ StubLauncher::TerminateStub() force_kill = true; } - int status; if (force_kill) { - kill(stub_pid_, SIGKILL); + KillStubProcess(); + } else { + WaitForStubProcess(); } - waitpid(stub_pid_, &status, 0); } // First destroy the IPCControl. This makes sure that IPCControl is @@ -540,10 +715,16 @@ StubLauncher::ClearQueues() void StubLauncher::KillStubProcess() { +#ifdef _WIN32 + unsigned int exit_code; + TerminateProcess(stub_pid_.hProcess, exit_code); + CloseHandle(stub_pid_.hProcess); + CloseHandle(stub_pid_.hThread); +#else kill(stub_pid_, SIGKILL); - int status; - waitpid(stub_pid_, &status, 0); + WaitForStubProcess(); stub_pid_ = 0; +#endif } TRITONSERVER_Error* @@ -600,6 +781,19 @@ StubLauncher::ReceiveMessageFromStub( return nullptr; // success } +void +StubLauncher::WaitForStubProcess() +{ +#ifdef _WIN32 + WaitForSingleObject(stub_pid_.hProcess, INFINITE); + CloseHandle(stub_pid_.hProcess); + CloseHandle(stub_pid_.hThread); +#else + int status; + waitpid(stub_pid_, &status, 0); +#endif +} + #ifdef TRITON_ENABLE_GPU void StubLauncher::ShareCUDAMemoryPool( diff --git a/src/stub_launcher.h b/src/stub_launcher.h index fbbbdbad..6c8dd910 100644 --- a/src/stub_launcher.h +++ b/src/stub_launcher.h @@ -26,8 +26,6 @@ #pragma once -#include - #include #include #include @@ -79,8 +77,8 @@ class StubLauncher { // Model instance stub process TRITONSERVER_Error* ModelInstanceStubProcess(); - // Stub PID - pid_t StubPid() { return stub_pid_; } + // Check if Stub PID is active + bool StubActive(); // Health mutex bi::interprocess_mutex* HealthMutex() { return health_mutex_; } @@ -151,6 +149,14 @@ class StubLauncher { TRITONSERVER_Error* ReceiveMessageFromStub( bi::managed_external_buffer::handle_t& message); + // Wait for stub process + void WaitForStubProcess(); + +#ifndef _WIN32 + // FIXME [DLIS-5969]: Enable for Windows when custom execution environments + // are supported. 
+ TRITONSERVER_Error* GetPythonEnvironment(ModelState* model_state); +#endif #ifdef TRITON_ENABLE_GPU // Share CUDA memory pool with stub process void ShareCUDAMemoryPool( @@ -158,9 +164,14 @@ class StubLauncher { #endif // TRITON_ENABLE_GPU private: +#ifdef _WIN32 + STARTUPINFO startup_info_; + DWORD parent_pid_; + PROCESS_INFORMATION stub_pid_; +#else pid_t parent_pid_; pid_t stub_pid_; - +#endif bool is_initialized_; bool is_decoupled_; bool is_healthy_; From 34a4db57d971ab66bc2302a35f944ee9471508e2 Mon Sep 17 00:00:00 2001 From: Kyle McGill <101670481+nv-kmcgill53@users.noreply.github.com> Date: Wed, 6 Mar 2024 14:06:57 -0800 Subject: [PATCH 13/54] patching git repository parameterization from production branch 1 (#341) Co-authored-by: kyle --- CMakeLists.txt | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index bc5387ef..dacd0f9c 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -47,6 +47,7 @@ if(WIN32) set(TRITON_ENABLE_GPU OFF CACHE BOOL "GPU disabled" FORCE) endif() +set(TRITON_REPO_ORGANIZATION "https://github.com/triton-inference-server" CACHE STRING "Git repository to pull from") set(TRITON_BACKEND_REPO_TAG "main" CACHE STRING "Tag for triton-inference-server/backend repo") set(TRITON_COMMON_REPO_TAG "main" CACHE STRING "Tag for triton-inference-server/common repo") set(TRITON_CORE_REPO_TAG "main" CACHE STRING "Tag for triton-inference-server/core repo") @@ -69,17 +70,17 @@ include(ExternalProject) FetchContent_Declare( repo-common - GIT_REPOSITORY https://github.com/triton-inference-server/common.git + GIT_REPOSITORY ${TRITON_REPO_ORGANIZATION}/common.git GIT_TAG ${TRITON_COMMON_REPO_TAG} ) FetchContent_Declare( repo-core - GIT_REPOSITORY https://github.com/triton-inference-server/core.git + GIT_REPOSITORY ${TRITON_REPO_ORGANIZATION}/core.git GIT_TAG ${TRITON_CORE_REPO_TAG} ) FetchContent_Declare( repo-backend - GIT_REPOSITORY https://github.com/triton-inference-server/backend.git + GIT_REPOSITORY ${TRITON_REPO_ORGANIZATION}/backend.git GIT_TAG ${TRITON_BACKEND_REPO_TAG} ) FetchContent_MakeAvailable(repo-common repo-core repo-backend) From 0413e46bdbaca09541afa181586c60924ff18ae1 Mon Sep 17 00:00:00 2001 From: Kris Hung Date: Thu, 7 Mar 2024 09:55:44 +0800 Subject: [PATCH 14/54] Remove $ (#343) --- examples/auto_complete/README.md | 10 +++--- examples/bls/README.md | 38 +++++++++++----------- examples/decoupled/README.md | 14 ++++---- examples/jax/README.md | 24 +++++++------- examples/preprocessing/README.md | 56 ++++++++++++++++++++++++-------- inferentia/README.md | 24 +++++++------- 6 files changed, 97 insertions(+), 69 deletions(-) diff --git a/examples/auto_complete/README.md b/examples/auto_complete/README.md index f530da3a..b07e065c 100644 --- a/examples/auto_complete/README.md +++ b/examples/auto_complete/README.md @@ -1,5 +1,5 @@ + # **Preprocessing Using Python Backend Example** This example shows how to preprocess your inputs using Python backend before it is passed to the TensorRT model for inference. This ensemble model includes an image preprocessing model (preprocess) and a TensorRT model (resnet50_trt) to do inference. @@ -5,39 +33,39 @@ This example shows how to preprocess your inputs using Python backend before it Run onnx_exporter.py to convert ResNet50 PyTorch model to ONNX format. Width and height dims are fixed at 224 but dynamic axes arguments for dynamic batching are used. Commands from the 2. and 3. subsections shall be executed within this Docker container. 
- $ docker run -it --gpus=all -v $(pwd):/workspace nvcr.io/nvidia/pytorch:xx.yy-py3 bash - $ pip install numpy pillow torchvision - $ python onnx_exporter.py --save model.onnx + docker run -it --gpus=all -v $(pwd):/workspace nvcr.io/nvidia/pytorch:xx.yy-py3 bash + pip install numpy pillow torchvision + python onnx_exporter.py --save model.onnx **2. Create the model repository:** - $ mkdir -p model_repository/ensemble_python_resnet50/1 - $ mkdir -p model_repository/preprocess/1 - $ mkdir -p model_repository/resnet50_trt/1 + mkdir -p model_repository/ensemble_python_resnet50/1 + mkdir -p model_repository/preprocess/1 + mkdir -p model_repository/resnet50_trt/1 # Copy the Python model - $ cp model.py model_repository/preprocess/1 + cp model.py model_repository/preprocess/1 **3. Build a TensorRT engine for the ONNX model** Set the arguments for enabling fp16 precision --fp16. To enable dynamic shapes use --minShapes, --optShapes, and maxShapes with --explicitBatch: - $ trtexec --onnx=model.onnx --saveEngine=./model_repository/resnet50_trt/1/model.plan --explicitBatch --minShapes=input:1x3x224x224 --optShapes=input:1x3x224x224 --maxShapes=input:256x3x224x224 --fp16 + trtexec --onnx=model.onnx --saveEngine=./model_repository/resnet50_trt/1/model.plan --explicitBatch --minShapes=input:1x3x224x224 --optShapes=input:1x3x224x224 --maxShapes=input:256x3x224x224 --fp16 **4. Run the command below to start the server container:** Under python_backend/examples/preprocessing, run this command to start the server docker container: - $ docker run --gpus=all -it --rm -p8000:8000 -p8001:8001 -p8002:8002 -v$(pwd):/workspace/ -v/$(pwd)/model_repository:/models nvcr.io/nvidia/tritonserver:xx.yy-py3 bash - $ pip install numpy pillow torchvision - $ tritonserver --model-repository=/models + docker run --gpus=all -it --rm -p8000:8000 -p8001:8001 -p8002:8002 -v$(pwd):/workspace/ -v/$(pwd)/model_repository:/models nvcr.io/nvidia/tritonserver:xx.yy-py3 bash + pip install numpy pillow torchvision + tritonserver --model-repository=/models **5. Start the client to test:** Under python_backend/examples/preprocessing, run the commands below to start the client Docker container: - $ wget https://raw.githubusercontent.com/triton-inference-server/server/main/qa/images/mug.jpg -O "mug.jpg" - $ docker run --rm --net=host -v $(pwd):/workspace/ nvcr.io/nvidia/tritonserver:xx.yy-py3-sdk python client.py --image mug.jpg - $ The result of classification is:COFFEE MUG + wget https://raw.githubusercontent.com/triton-inference-server/server/main/qa/images/mug.jpg -O "mug.jpg" + docker run --rm --net=host -v $(pwd):/workspace/ nvcr.io/nvidia/tritonserver:xx.yy-py3-sdk python client.py --image mug.jpg + The result of classification is:COFFEE MUG Here, since we input an image of "mug" and the inference result is "COFFEE MUG" which is correct. diff --git a/inferentia/README.md b/inferentia/README.md index 6a90740d..381c8ed8 100644 --- a/inferentia/README.md +++ b/inferentia/README.md @@ -60,18 +60,18 @@ or simply clone with https. Clone this repo with Github to home repo `/home/ubuntu`. 
``` - $chmod 777 /home/ubuntu/python_backend/inferentia/scripts/setup-pre-container.sh - $sudo /home/ubuntu/python_backend/inferentia/scripts/setup-pre-container.sh + chmod 777 /home/ubuntu/python_backend/inferentia/scripts/setup-pre-container.sh + sudo /home/ubuntu/python_backend/inferentia/scripts/setup-pre-container.sh ``` Then, start the Triton instance with: ``` - $docker run --device /dev/neuron0 -v /home/ubuntu/python_backend:/home/ubuntu/python_backend -v /lib/udev:/mylib/udev --shm-size=1g --ulimit memlock=-1 -p 8000:8000 -p 8001:8001 -p 8002:8002 --ulimit stack=67108864 -ti nvcr.io/nvidia/tritonserver:-py3 + docker run --device /dev/neuron0 -v /home/ubuntu/python_backend:/home/ubuntu/python_backend -v /lib/udev:/mylib/udev --shm-size=1g --ulimit memlock=-1 -p 8000:8000 -p 8001:8001 -p 8002:8002 --ulimit stack=67108864 -ti nvcr.io/nvidia/tritonserver:-py3 ``` Note 1: The user would need to list any neuron device to run during container initialization. For example, to use 4 neuron devices on an instance, the user would need to run with: ``` - $docker run --device /dev/neuron0 --device /dev/neuron1 --device /dev/neuron2 --device /dev/neuron3 ...` + docker run --device /dev/neuron0 --device /dev/neuron1 --device /dev/neuron2 --device /dev/neuron3 ...` ``` Note 2: `/mylib/udev` is used for Neuron parameter passing. @@ -81,7 +81,7 @@ Note 3: For Triton container version xx.yy, please refer to After starting the Triton container, go into the `python_backend` folder and run the setup script. ``` - $source /home/ubuntu/python_backend/inferentia/scripts/setup.sh + source /home/ubuntu/python_backend/inferentia/scripts/setup.sh ``` This script will: 1. Install necessary dependencies @@ -118,7 +118,7 @@ triton python model directory. An example invocation for the `gen_triton_model.py` for PyTorch model can look like: ``` - $python3 inferentia/scripts/gen_triton_model.py --model_type pytorch --triton_input INPUT__0,INT64,4x384 INPUT__1,INT64,4x384 INPUT__2,INT64,4x384 --triton_output OUTPUT__0,INT64,4x384 OUTPUT__1,INT64,4x384 --compiled_model /home/ubuntu/bert_large_mlperf_neuron_hack_bs1_dynamic.pt --neuron_core_range 0:3 --triton_model_dir bert-large-mlperf-bs1x4 + python3 inferentia/scripts/gen_triton_model.py --model_type pytorch --triton_input INPUT__0,INT64,4x384 INPUT__1,INT64,4x384 INPUT__2,INT64,4x384 --triton_output OUTPUT__0,INT64,4x384 OUTPUT__1,INT64,4x384 --compiled_model /home/ubuntu/bert_large_mlperf_neuron_hack_bs1_dynamic.pt --neuron_core_range 0:3 --triton_model_dir bert-large-mlperf-bs1x4 ``` In order for the script to treat the compiled model as TorchScript @@ -161,7 +161,7 @@ script to generate triton python model directory. An example invocation for the `gen_triton_model.py` for TensorFlow model can look like: ``` - $python3 gen_triton_model.py --model_type tensorflow --compiled_model /home/ubuntu/inferentia-poc-2.0/scripts-rn50-tf-native/resnet50_mlperf_opt_fp16_compiled_b5_nc1/1 --neuron_core_range 0:3 --triton_model_dir rn50-1neuroncores-bs1x1 + python3 gen_triton_model.py --model_type tensorflow --compiled_model /home/ubuntu/inferentia-poc-2.0/scripts-rn50-tf-native/resnet50_mlperf_opt_fp16_compiled_b5_nc1/1 --neuron_core_range 0:3 --triton_model_dir rn50-1neuroncores-bs1x1 ``` NOTE: Unlike TorchScript model, TensorFlow SavedModel stores sufficient @@ -215,7 +215,7 @@ a valid torchscript file or tensorflow savedmodel. 
Now, the server can be launched with the model as below: ``` - $tritonserver --model-repository + tritonserver --model-repository ``` Note: @@ -255,7 +255,7 @@ contains the necessary files to set up testing with a simple add_sub model. The requires an instance with more than 8 inferentia cores to run, eg:`inf1.6xlarge`. start the test, run ``` - $source /python_backend/inferentia/qa/setup_test_enviroment_and_test.sh + source /python_backend/inferentia/qa/setup_test_enviroment_and_test.sh ``` where `` is usually `/home/ubuntu`/. This script will pull the [server repo](https://github.com/triton-inference-server/server) @@ -265,7 +265,7 @@ Triton Server and Triton SDK. Note: If you would need to change some of the tests in the server repo, you would need to run ``` - $export TRITON_SERVER_REPO_TAG= + export TRITON_SERVER_REPO_TAG= ``` before running the script. @@ -273,8 +273,8 @@ before running the script. ## pytorch-neuronx and tensorflow-neuronx 1. Similar to the steps for inf1, change the argument to the pre-container and on-container setup scripts to include the `-inf2` or `-trn1`flags e.g., ``` - $chmod 777 /home/ubuntu/python_backend/inferentia/scripts/setup-pre-container.sh - $sudo /home/ubuntu/python_backend/inferentia/scripts/setup-pre-container.sh -inf2 + chmod 777 /home/ubuntu/python_backend/inferentia/scripts/setup-pre-container.sh + sudo /home/ubuntu/python_backend/inferentia/scripts/setup-pre-container.sh -inf2 ``` 2. On the container, followed by the `docker run` command, you can pass similar argument to the setup.sh script For Pytorch: From 8917c86a4f6face7b55319c6ca08dbd4378feef6 Mon Sep 17 00:00:00 2001 From: Sai Kiran Polisetty Date: Tue, 12 Mar 2024 00:58:01 +0530 Subject: [PATCH 15/54] Add Correlation Id string support for BLS (#344) * Add correlation id string support for BLS --- CMakeLists.txt | 4 +- README.md | 8 ++- src/correlation_id.cc | 120 ++++++++++++++++++++++++++++++++++++++++ src/correlation_id.h | 93 +++++++++++++++++++++++++++++++ src/infer_request.cc | 82 +++++++++++---------------- src/infer_request.h | 15 +++-- src/pb_stub.cc | 32 +++++++++-- src/python_be.cc | 19 +++++-- src/request_executor.cc | 13 ++++- 9 files changed, 316 insertions(+), 70 deletions(-) create mode 100644 src/correlation_id.cc create mode 100644 src/correlation_id.h diff --git a/CMakeLists.txt b/CMakeLists.txt index dacd0f9c..92b785bc 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,4 +1,4 @@ -# Copyright 2020-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# Copyright 2020-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions @@ -149,6 +149,8 @@ configure_file(src/libtriton_python.ldscript libtriton_python.ldscript COPYONLY) set( COMMON_SRCS + src/correlation_id.cc + src/correlation_id.h src/infer_response.cc src/infer_response.h src/infer_request.cc diff --git a/README.md b/README.md index 9182ae37..1b94d6b7 100644 --- a/README.md +++ b/README.md @@ -1,5 +1,5 @@ -# **Preprocessing Using Python Backend Example** +# Preprocessing Using Python Backend Example This example shows how to preprocess your inputs using Python backend before it is passed to the TensorRT model for inference. This ensemble model includes an image preprocessing model (preprocess) and a TensorRT model (resnet50_trt) to do inference. **1. 
Converting PyTorch Model to ONNX format:** diff --git a/inferentia/README.md b/inferentia/README.md index 381c8ed8..fb0de4f7 100644 --- a/inferentia/README.md +++ b/inferentia/README.md @@ -34,7 +34,7 @@ and the [Neuron Runtime](https://awsdocs-neuron.readthedocs-hosted.com/en/latest ## Table of Contents -- [Using Triton with Inferentia](#using-triton-with-inferentia) +- [Using Triton with Inferentia 1](#using-triton-with-inferentia-1) - [Table of Contents](#table-of-contents) - [Inferentia setup](#inferentia-setup) - [Setting up the Inferentia model](#setting-up-the-inferentia-model) From 4d4211151d716e2a534ab1b8e8413d3c66967723 Mon Sep 17 00:00:00 2001 From: Iman Tabrizian Date: Mon, 8 Apr 2024 13:22:08 -0400 Subject: [PATCH 20/54] Randomize Python backend shared memory region naming (#351) * Fix deprecated client package * Randomize Python backend shared memory region naming * Update docs --- README.md | 12 +++++++----- examples/preprocessing/client.py | 2 +- src/pb_utils.cc | 11 ++++++++++- src/pb_utils.h | 5 +++++ src/python_be.cc | 1 - src/stub_launcher.cc | 5 +---- 6 files changed, 24 insertions(+), 12 deletions(-) diff --git a/README.md b/README.md index 93fd212f..1bc9bd9b 100644 --- a/README.md +++ b/README.md @@ -1067,11 +1067,13 @@ will create additional threads instead of spawning separate processes. ## Running Multiple Instances of Triton Server -Python backend uses shared memory to transfer requests to the stub process. -When running multiple instances of Triton Server on the same machine that use -Python models, there would be shared memory region name conflicts that can -result in segmentation faults or hangs. In order to avoid this issue, you need -to specify different `shm-region-prefix-name` using the `--backend-config` flag. +Starting from 24.04 release, Python backend uses UUID to generate unique +names for Python backend shared memory regions so that multiple instances of +the server can run at the same time without any conflicts. + +If you're using a Python backend released before the 24.04 release, you need +to specify different `shm-region-prefix-name` using the `--backend-config` flag +to avoid conflicts between the shared memory regions. 
For example: ``` # Triton instance 1 diff --git a/examples/preprocessing/client.py b/examples/preprocessing/client.py index 202d411a..1ac107af 100644 --- a/examples/preprocessing/client.py +++ b/examples/preprocessing/client.py @@ -29,7 +29,7 @@ import sys import numpy as np -import tritongrpcclient +import tritonclient.grpc as tritongrpcclient def load_image(img_path: str): diff --git a/src/pb_utils.cc b/src/pb_utils.cc index 7bc17fa4..809531b8 100644 --- a/src/pb_utils.cc +++ b/src/pb_utils.cc @@ -314,4 +314,13 @@ WrapTritonErrorInSharedPtr(TRITONSERVER_Error* error) return response_error; } #endif // NOT TRITON_PB_STUB -}}} // namespace triton::backend::python + +std::string +GenerateUUID() +{ + static boost::uuids::random_generator generator; + boost::uuids::uuid uuid = generator(); + return boost::uuids::to_string(uuid); +} + +}}} // namespace triton::backend::python diff --git a/src/pb_utils.h b/src/pb_utils.h index 6d5f21ce..1a6c2d8b 100644 --- a/src/pb_utils.h +++ b/src/pb_utils.h @@ -32,6 +32,9 @@ #include #include +#include +#include +#include #include #include #include @@ -335,4 +338,6 @@ std::shared_ptr WrapTritonErrorInSharedPtr( TRITONSERVER_Error* error); #endif +std::string GenerateUUID(); + }}} // namespace triton::backend::python diff --git a/src/python_be.cc b/src/python_be.cc index 57e6cffd..b688fdfd 100644 --- a/src/python_be.cc +++ b/src/python_be.cc @@ -2131,7 +2131,6 @@ TRITONBACKEND_Initialize(TRITONBACKEND_Backend* backend) backend_state->shm_growth_byte_size = 1 * 1024 * 1024; // 1 MB backend_state->stub_timeout_seconds = 30; backend_state->shm_message_queue_size = 1000; - backend_state->number_of_instance_inits = 0; backend_state->thread_pool_size = 32; // Initialize shared memory region prefix to include backend's name // to avoid collision between python backend and python-based backends. diff --git a/src/stub_launcher.cc b/src/stub_launcher.cc index a9956b55..9dc2a64a 100644 --- a/src/stub_launcher.cc +++ b/src/stub_launcher.cc @@ -78,12 +78,9 @@ StubLauncher::Initialize(ModelState* model_state) stub_pid_ = 0; #endif - // Atomically increase and read the stub process count to avoid shared memory - // region name collision - int num_init = ++model_state->StateForBackend()->number_of_instance_inits; shm_region_name_ = model_state->StateForBackend()->shared_memory_region_prefix + - std::to_string(num_init); + GenerateUUID(); model_version_ = model_state->Version(); From 0cdcaf3f0ff3fe2f0449c269a15b62899813ccd0 Mon Sep 17 00:00:00 2001 From: Jacky <18255193+kthui@users.noreply.github.com> Date: Thu, 11 Apr 2024 10:55:39 -0700 Subject: [PATCH 21/54] Decoupled Async Execute (#350) * Add async decoupled execute * Enable decoupled bls async exec * Improve handling for async execute future object * Add docs for async execute for decoupled model * Fix link on docs * Improve docs wording * Improve destruction steps for async execute future object * Piggy back on GIL for protection * Document model should not modify event loop * Use Python add_done_callback * Protect infer_payload_ * Use traceback API that supports Python 3.8 and 3.9 * Update docs --- README.md | 21 +++++++++-- src/pb_stub.cc | 90 ++++++++++++++++++++++++++++++++++++++++++------ src/pb_stub.h | 7 +++- src/python_be.cc | 2 ++ src/python_be.h | 1 + 5 files changed, 107 insertions(+), 14 deletions(-) diff --git a/README.md b/README.md index 1bc9bd9b..7f9c7027 100644 --- a/README.md +++ b/README.md @@ -49,7 +49,7 @@ any C++ code. 
- [Request Cancellation Handling](#request-cancellation-handling) - [Decoupled mode](#decoupled-mode) - [Use Cases](#use-cases) - - [Known Issues](#known-issues) + - [Async Execute](#async-execute) - [Request Rescheduling](#request-rescheduling) - [`finalize`](#finalize) - [Model Config File](#model-config-file) @@ -620,9 +620,24 @@ full power of what can be achieved from decoupled API. Read [Decoupled Backends and Models](https://github.com/triton-inference-server/server/blob/main/docs/user_guide/decoupled_models.md) for more details on how to host a decoupled model. -##### Known Issues +##### Async Execute -* Currently, decoupled Python models can not make async infer requests. +Starting from 24.04, `async def execute(self, requests):` is supported for +decoupled Python models. Its coroutine will be executed by an AsyncIO event loop +shared with requests executing in the same model instance. The next request for +the model instance can start executing while the current request is waiting. + +This is useful for minimizing the number of model instances for models that +spend the majority of its time waiting, given requests can be executed +concurrently by AsyncIO. To take full advantage of the concurrency, it is vital +for the async execute function to not block the event loop from making progress +while it is waiting, i.e. downloading over the network. + +Notes: +* The model should not modify the running event loop, as this might cause +unexpected issues. +* The server/backend do not control how many requests are added to the event +loop by a model instance. #### Request Rescheduling diff --git a/src/pb_stub.cc b/src/pb_stub.cc index a9a910a1..b12e249d 100644 --- a/src/pb_stub.cc +++ b/src/pb_stub.cc @@ -104,6 +104,32 @@ PyDefaultArgumentToMutableType(const py::object& argument) std::string(py::str(argument.get_type()))); } +void +AsyncEventFutureDoneCallback(const py::object& py_future) +{ + // TODO: Why using `py_future.result()` with error hangs on exit? 
+ try { + py::object exception = py_future.attr("exception")(); + if (!py::isinstance(exception)) { + std::string err_msg = ""; + py::object traceback = py::module_::import("traceback") + .attr("TracebackException") + .attr("from_exception")(exception) + .attr("format")(); + for (py::handle line : traceback) { + err_msg += py::str(line); + } + LOG_ERROR << err_msg; + } + } + catch (const PythonBackendException& pb_exception) { + LOG_ERROR << pb_exception.what(); + } + catch (const py::error_already_set& error) { + LOG_ERROR << error.what(); + } +} + void Stub::Instantiate( int64_t shm_growth_size, int64_t shm_default_size, @@ -533,6 +559,8 @@ Stub::Initialize(bi::managed_external_buffer::handle_t map_handle) c_python_backend_utils.attr("InferenceResponse")); c_python_backend_utils.attr("shared_memory") = py::cast(shm_pool_.get()); + async_event_loop_ = py::none(); + py::object TritonPythonModel = sys.attr("TritonPythonModel"); deserialize_bytes_ = python_backend_utils.attr("deserialize_bytes_tensor"); serialize_bytes_ = python_backend_utils.attr("serialize_byte_tensor"); @@ -690,11 +718,18 @@ Stub::ProcessRequestsDecoupled(RequestBatch* request_batch_shm_ptr) py::object execute_return = model_instance_.attr("execute")(py_request_list); - if (!py::isinstance(execute_return)) { - throw PythonBackendException( - "Python model '" + name_ + - "' is using the decoupled mode and the execute function must " - "return None."); + bool is_coroutine = py::module::import("asyncio") + .attr("iscoroutine")(execute_return) + .cast(); + if (is_coroutine) { + RunCoroutine(execute_return); + } else { + if (!py::isinstance(execute_return)) { + throw PythonBackendException( + "Python model '" + name_ + + "' is using the decoupled mode and the execute function must " + "return None."); + } } } } @@ -870,6 +905,35 @@ Stub::ProcessRequests(RequestBatch* request_batch_shm_ptr) } } +py::object +Stub::GetAsyncEventLoop() +{ + if (py::isinstance(async_event_loop_)) { + // Create the event loop if not already. + py::module asyncio = py::module_::import("asyncio"); + async_event_loop_ = asyncio.attr("new_event_loop")(); + asyncio.attr("set_event_loop")(async_event_loop_); + py::object py_thread = + py::module_::import("threading") + .attr("Thread")( + "target"_a = async_event_loop_.attr("run_forever"), + "daemon"_a = true); + py_thread.attr("start")(); + } + return async_event_loop_; +} + +void +Stub::RunCoroutine(py::object coroutine) +{ + py::object loop = GetAsyncEventLoop(); + py::object py_future = py::module_::import("asyncio").attr( + "run_coroutine_threadsafe")(coroutine, loop); + py_future.attr("add_done_callback")( + py::module_::import("c_python_backend_utils") + .attr("async_event_future_done_callback")); +} + void Stub::UpdateHealth() { @@ -881,6 +945,10 @@ void Stub::Finalize() { finalizing_ = true; + // Stop async event loop if created. + if (!py::isinstance(async_event_loop_)) { + async_event_loop_.attr("stop")(); + } // Call finalize if exists. 
if (initialized_ && py::hasattr(model_instance_, "finalize")) { try { @@ -943,6 +1011,7 @@ Stub::~Stub() { py::gil_scoped_acquire acquire; + async_event_loop_ = py::none(); model_instance_ = py::none(); } stub_instance_.reset(); @@ -1729,11 +1798,6 @@ PYBIND11_EMBEDDED_MODULE(c_python_backend_utils, module) [](std::shared_ptr& infer_request, const bool decoupled) { std::unique_ptr& stub = Stub::GetOrCreateInstance(); - if (stub->IsDecoupled()) { - throw PythonBackendException( - "Async BLS request execution is not support in the decoupled " - "API."); - } py::object loop = py::module_::import("asyncio").attr("get_running_loop")(); py::cpp_function callback = [&stub, infer_request, decoupled]() { @@ -1860,6 +1924,12 @@ PYBIND11_EMBEDDED_MODULE(c_python_backend_utils, module) "is_model_ready", &IsModelReady, py::arg("model_name").none(false), py::arg("model_version").none(false) = ""); + // This function is not part of the public API for Python backend. This is + // only used for internal callbacks. + module.def( + "async_event_future_done_callback", &AsyncEventFutureDoneCallback, + py::arg("py_future").none(false)); + // This class is not part of the public API for Python backend. This is only // used for internal testing purposes. py::class_(module, "SharedMemory") diff --git a/src/pb_stub.h b/src/pb_stub.h index a51f25f5..c9462fd0 100644 --- a/src/pb_stub.h +++ b/src/pb_stub.h @@ -1,4 +1,4 @@ -// Copyright 2021-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// Copyright 2021-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions @@ -255,6 +255,10 @@ class Stub { void ProcessRequestsDecoupled(RequestBatch* request_batch_shm_ptr); + py::object GetAsyncEventLoop(); + + void RunCoroutine(py::object coroutine); + /// Get the memory manager message queue std::unique_ptr>& MemoryManagerQueue(); @@ -363,6 +367,7 @@ class Stub { py::object model_instance_; py::object deserialize_bytes_; py::object serialize_bytes_; + py::object async_event_loop_; std::unique_ptr> stub_message_queue_; std::unique_ptr> diff --git a/src/python_be.cc b/src/python_be.cc index b688fdfd..b95fb715 100644 --- a/src/python_be.cc +++ b/src/python_be.cc @@ -768,6 +768,7 @@ ModelInstanceState::ExecuteBLSRequest( if (is_decoupled && (infer_response->Id() != nullptr)) { // Need to manage the lifetime of InferPayload object for bls // decoupled responses. + std::lock_guard lock(infer_payload_mu_); infer_payload_[reinterpret_cast(infer_payload.get())] = infer_payload; } @@ -961,6 +962,7 @@ ModelInstanceState::ProcessCleanupRequest( intptr_t id = reinterpret_cast(cleanup_message_ptr->id); if (message->Command() == PYTHONSTUB_BLSDecoupledInferPayloadCleanup) { // Remove the InferPayload object from the map. 
+ std::lock_guard lock(infer_payload_mu_); infer_payload_.erase(id); } else if (message->Command() == PYTHONSTUB_DecoupledResponseFactoryCleanup) { // Delete response factory diff --git a/src/python_be.h b/src/python_be.h index 4430767c..9618204c 100644 --- a/src/python_be.h +++ b/src/python_be.h @@ -296,6 +296,7 @@ class ModelInstanceState : public BackendModelInstance { std::vector> futures_; std::unique_ptr thread_pool_; std::unordered_map> infer_payload_; + std::mutex infer_payload_mu_; std::unique_ptr request_executor_; public: From ad4a44014dda78c2df48b3209dd23bb016a24369 Mon Sep 17 00:00:00 2001 From: Jacky <18255193+kthui@users.noreply.github.com> Date: Tue, 16 Apr 2024 11:31:26 -0700 Subject: [PATCH 22/54] Reset async_event_loop_ only if initialized (#354) --- src/pb_stub.cc | 22 ++++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/src/pb_stub.cc b/src/pb_stub.cc index b12e249d..56d466f5 100644 --- a/src/pb_stub.cc +++ b/src/pb_stub.cc @@ -945,17 +945,19 @@ void Stub::Finalize() { finalizing_ = true; - // Stop async event loop if created. - if (!py::isinstance(async_event_loop_)) { - async_event_loop_.attr("stop")(); - } - // Call finalize if exists. - if (initialized_ && py::hasattr(model_instance_, "finalize")) { - try { - model_instance_.attr("finalize")(); + if (initialized_) { + // Stop async event loop if created. + if (!py::isinstance(async_event_loop_)) { + async_event_loop_.attr("stop")(); } - catch (const py::error_already_set& e) { - LOG_INFO << e.what(); + // Call finalize if exists. + if (py::hasattr(model_instance_, "finalize")) { + try { + model_instance_.attr("finalize")(); + } + catch (const py::error_already_set& e) { + LOG_INFO << e.what(); + } } } #ifdef TRITON_ENABLE_GPU From b7a069083ecf16020d7144fa596a6ed8f36559b6 Mon Sep 17 00:00:00 2001 From: Iman Tabrizian Date: Tue, 16 Apr 2024 18:12:05 -0400 Subject: [PATCH 23/54] Add vscode configurations to make development easier (#352) * Add vscode configurations to make development easier * Review comment * Fix merge conflict * Fix permission * Update dockerfile * Fix username * Review comments * Add link * Review edit --- .devcontainer/Dockerfile | 48 +++++++++++++++++++ .devcontainer/devcontainer.json | 26 ++++++++++ .gitignore | 1 - .vscode/tasks.json | 85 +++++++++++++++++++++++++++++++++ README.md | 12 +++++ 5 files changed, 171 insertions(+), 1 deletion(-) create mode 100644 .devcontainer/Dockerfile create mode 100644 .devcontainer/devcontainer.json create mode 100644 .vscode/tasks.json diff --git a/.devcontainer/Dockerfile b/.devcontainer/Dockerfile new file mode 100644 index 00000000..737725bb --- /dev/null +++ b/.devcontainer/Dockerfile @@ -0,0 +1,48 @@ +# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. 
+# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +FROM nvcr.io/nvidia/tritonserver:24.03-py3 + +ARG USERNAME=triton-server + +RUN apt-get update \ + && apt-get install -y sudo + +RUN pip3 install transformers torch + +# Create the user +RUN apt-get update \ + && apt-get install -y sudo \ + && echo $USERNAME ALL=\(root\) NOPASSWD:ALL > /etc/sudoers.d/$USERNAME \ + && chmod 0440 /etc/sudoers.d/$USERNAME + +RUN pip3 install pre-commit ipdb + +RUN mkhomedir_helper triton-server + +RUN apt-get install -y cmake rapidjson-dev + +USER ${USERNAME} diff --git a/.devcontainer/devcontainer.json b/.devcontainer/devcontainer.json new file mode 100644 index 00000000..e1b8bd10 --- /dev/null +++ b/.devcontainer/devcontainer.json @@ -0,0 +1,26 @@ +{ + "name": "Python Backend", + + "build": { + "dockerfile": "Dockerfile" + }, + "customizations": { + "vscode": { + "extensions": [ + "ms-python.vscode-pylance", + "ms-python.python", + "ms-vscode.cpptools-extension-pack", + "ms-vscode.cmake-tools", + "github.vscode-pull-request-github" + ] + } + }, + "postCreateCommand": "sudo chown -R triton-server:triton-server ~/.cache", + + "runArgs": [ "--cap-add=SYS_PTRACE", "--security-opt", "seccomp=unconfined", "--gpus=all", "--shm-size=2g", "--ulimit", "stack=67108864" ], + "mounts": [ + "source=${localEnv:HOME}/.ssh,target=/home/triton-server/.ssh,type=bind,consistency=cached", + "source=${localEnv:HOME}/.cache/huggingface,target=/home/triton-server/.cache/huggingface,type=bind,consistency=cached" + ], + "remoteUser": "triton-server" +} diff --git a/.gitignore b/.gitignore index bf7e1686..293f6455 100644 --- a/.gitignore +++ b/.gitignore @@ -1,5 +1,4 @@ /build -/.vscode *.so builddir diff --git a/.vscode/tasks.json b/.vscode/tasks.json new file mode 100644 index 00000000..597a746d --- /dev/null +++ b/.vscode/tasks.json @@ -0,0 +1,85 @@ +{ + "version": "2.0.0", + "tasks": [ + { + "label": "Configure", + "type": "shell", + "command": "cmake", + "args": [ + "-DCMAKE_INSTALL_PREFIX:STRING=/opt/tritonserver/", + "-DTRITON_COMMON_REPO_TAG:STRING=main", + "-DTRITON_BACKEND_REPO_TAG:STRING=main", + "-DTRITON_CORE_REPO_TAG:STRING=main", + "-DTRITON_ENABLE_GPU:STRING=ON", + "-DTRITON_ENABLE_NVTX:STRING=ON", + "-DCMAKE_INSTALL_PREFIX:STRING=${workspaceFolder}/build/install", + "-DCMAKE_EXPORT_COMPILE_COMMANDS:BOOL=TRUE", + "-DCMAKE_BUILD_TYPE:STRING=Debug", + "-DCMAKE_C_COMPILER:FILEPATH=/usr/bin/gcc", + "-DCMAKE_CXX_COMPILER:FILEPATH=/usr/bin/g++", + "-S${workspaceFolder}", + "-B${workspaceFolder}/build", + "-G", + "Unix Makefiles" + ], + "problemMatcher": [] + }, + { + "label": "Build", + "type": "shell", + "command": "cmake", + "args": [ + "--build", + "/${workspaceFolder}/build", + "--config", + "Debug", + "--target", + "all", + "-j", + "18", + "--" + ] + }, + { + "label": "Install", + "type": "shell", + 
"command": "cmake", + "args": [ + "--build", + "${workspaceFolder}/build", + "--config", + "Debug", + "--target", + "install", + "-j", + "18", + "--" + ] + }, + { + "label": "Move", + "type": "shell", + "command": "sudo", + "args": [ + "cp", + "-r", + "${workspaceFolder}/build/install/backends/python/*", + "/opt/tritonserver/backends/python" + ] + }, + { + "label": "Build Python Backend", + "dependsOrder": "sequence", + "dependsOn": [ + "Configure", + "Build", + "Install", + "Move" + ], + "group": { + "kind": "build", + "isDefault": true + } + } + ] +} diff --git a/README.md b/README.md index 7f9c7027..89b9213e 100644 --- a/README.md +++ b/README.md @@ -90,6 +90,7 @@ any C++ code. - [Custom Metrics](#custom-metrics-1) - [Running with Inferentia](#running-with-inferentia) - [Logging](#logging) +- [Development with VSCode](#development-with-vscode) - [Reporting problems, asking questions](#reporting-problems-asking-questions) ## Quick Start @@ -1825,6 +1826,17 @@ def initialize(self, args): # Should print {'custom_key': {'string_value': 'custom_value'}} ``` +# Development with VSCode + +The repository includes a `.devcontainer` folder that contains a `Dockerfile` +and `devcontainer.json` file to help you develop the Python backend +using +[Visual Studio Code](https://code.visualstudio.com/docs/devcontainers/containers). + +In order to build the backend, you can execute the "Build Python Backend" task in the +[VSCode tasks](https://code.visualstudio.com/docs/editor/tasks). This will build +the Python backend and install the artifacts in +`/opt/tritonserver/backends/python`. # Reporting problems, asking questions From 9d2c513d41368d4932ea3e6207cbb248d5d8c9ee Mon Sep 17 00:00:00 2001 From: Indrajit Bhosale Date: Wed, 8 May 2024 20:11:45 -0700 Subject: [PATCH 24/54] Add error handling in case of AutocompleteStub Failure for DLIS-5819 (#356) * DLIS-5819 * Guard WaitForStubProcess in case of failed auto-complete-config --- src/stub_launcher.cc | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/stub_launcher.cc b/src/stub_launcher.cc index 9dc2a64a..828228e6 100644 --- a/src/stub_launcher.cc +++ b/src/stub_launcher.cc @@ -787,7 +787,11 @@ StubLauncher::WaitForStubProcess() CloseHandle(stub_pid_.hThread); #else int status; - waitpid(stub_pid_, &status, 0); + if (stub_pid_ != 0) { + // Added this check to ensure server doesn't hang waiting after stub + // process has already be killed and cannot be waited on + waitpid(stub_pid_, &status, 0); + } #endif } From 27f04d10abb4e7d924ebb6ca4f97de923a2e4fa4 Mon Sep 17 00:00:00 2001 From: Jacky <18255193+kthui@users.noreply.github.com> Date: Wed, 5 Jun 2024 17:53:58 -0700 Subject: [PATCH 25/54] Add support for response sender in the default mode (#364) * Add response sender to non-decoupled models and unify data pipelines (#360) * Add response sender to non-decoupled model and unify data pipelines * Rename variable and class name * Fix decoupled batch statistics to account for implicit batch size (#361) * Fix decoupled gpu output error handling (#362) * Fix decoupled gpu output error handling * Return full error string upon exception from model * Response sender to check for improper non-decoupled model usage (#363) * Response sender to check for improper non-decoupled model usage * Force close response sender on exception * Rename functions --- README.md | 6 + src/infer_request.cc | 19 +- src/infer_request.h | 5 +- src/pb_stub.cc | 255 ++++++------------ src/pb_stub.h | 7 +- src/python_be.cc | 570 
++--------------------------------------- src/python_be.h | 45 +--- src/response_sender.cc | 129 +++++++--- src/response_sender.h | 18 +- 9 files changed, 241 insertions(+), 813 deletions(-) diff --git a/README.md b/README.md index 89b9213e..30f2dd25 100644 --- a/README.md +++ b/README.md @@ -479,6 +479,12 @@ Upon return from the execute function all tensor data associated with the InferenceRequest objects passed to the function are deleted, and so InferenceRequest objects should not be retained by the Python model. +Starting from 24.06, models may choose to send the response using the +`InferenceResponseSender` as illustrated on [Decoupled mode](#decoupled-mode). +Since the model is in default mode, it must send exactly one response per +request. The `pb_utils.TRITONSERVER_RESPONSE_COMPLETE_FINAL` flag must be sent +either with the response or as a flag only response afterward. + #### Error Handling In case one of the requests has an error, you can use the `TritonError` object diff --git a/src/infer_request.cc b/src/infer_request.cc index 31182281..57ea6cf1 100644 --- a/src/infer_request.cc +++ b/src/infer_request.cc @@ -74,7 +74,7 @@ InferRequest::InferRequest( pb_cancel_ = std::make_shared(response_factory_address_, request_address_); response_sender_ = std::make_shared( - request_address_, response_factory_address_, + request_address_, response_factory_address_, nullptr /* is_decoupled */, Stub::GetOrCreateInstance()->SharedMemory(), pb_cancel_); #endif } @@ -272,7 +272,8 @@ InferRequest::SaveToSharedMemory(std::unique_ptr& shm_pool) std::unique_ptr InferRequest::LoadFromSharedMemory( std::unique_ptr& shm_pool, - bi::managed_external_buffer::handle_t request_handle, bool open_cuda_handle) + bi::managed_external_buffer::handle_t request_handle, bool open_cuda_handle, + bool const* is_model_decoupled) { AllocatedSharedMemory infer_request_shm = shm_pool->Load(request_handle); @@ -328,7 +329,7 @@ InferRequest::LoadFromSharedMemory( return std::unique_ptr(new InferRequest( infer_request_shm, request_id_shm, correlation_id_shm, requested_output_names_shm, model_name_shm, input_tensors, parameters_shm, - infer_trace_shm)); + infer_trace_shm, is_model_decoupled)); } InferRequest::InferRequest( @@ -339,7 +340,8 @@ InferRequest::InferRequest( std::unique_ptr& model_name_shm, std::vector>& input_tensors, std::unique_ptr& parameters_shm, - std::unique_ptr& infer_trace_shm) + std::unique_ptr& infer_trace_shm, + bool const* is_model_decoupled) : infer_request_shm_(std::move(infer_request_shm)), request_id_shm_(std::move(request_id_shm)), requested_output_names_shm_(std::move(requested_output_names_shm)), @@ -387,7 +389,7 @@ InferRequest::InferRequest( pb_cancel_ = std::make_shared(response_factory_address_, request_address_); response_sender_ = std::make_shared( - request_address_, response_factory_address_, + request_address_, response_factory_address_, is_model_decoupled, Stub::GetOrCreateInstance()->SharedMemory(), pb_cancel_); #endif } @@ -402,13 +404,6 @@ InferRequest::IsCancelled() std::shared_ptr InferRequest::GetResponseSender() { - std::unique_ptr& stub = Stub::GetOrCreateInstance(); - if (!stub->IsDecoupled()) { - throw PythonBackendException( - "'get_response_sender' function must be called only when the model is " - "using the decoupled transaction policy."); - } - return response_sender_; } diff --git a/src/infer_request.h b/src/infer_request.h index e0887624..c67e2fb0 100644 --- a/src/infer_request.h +++ b/src/infer_request.h @@ -118,7 +118,7 @@ class InferRequest { static 
std::unique_ptr LoadFromSharedMemory( std::unique_ptr& shm_pool, bi::managed_external_buffer::handle_t request_handle, - bool open_cuda_handle); + bool open_cuda_handle, bool const* is_model_decoupled); /// Disallow copying the inference request object. DISALLOW_COPY_AND_ASSIGN(InferRequest); @@ -135,7 +135,8 @@ class InferRequest { std::unique_ptr& model_name_shm, std::vector>& input_tensors, std::unique_ptr& parameters_shm, - std::unique_ptr& infer_trace_shm); + std::unique_ptr& infer_trace_shm, + bool const* is_model_decoupled); std::string request_id_; CorrelationId correlation_id_; diff --git a/src/pb_stub.cc b/src/pb_stub.cc index 56d466f5..87410a70 100644 --- a/src/pb_stub.cc +++ b/src/pb_stub.cc @@ -402,11 +402,7 @@ Stub::RunCommand() shm_pool_->Load(ipc_message->Args()); RequestBatch* request_batch_shm_ptr = reinterpret_cast(request_batch.data_.get()); - if (!ipc_control_->decoupled) { - ProcessRequests(request_batch_shm_ptr); - } else { - ProcessRequestsDecoupled(request_batch_shm_ptr); - } + ProcessRequests(request_batch_shm_ptr); } break; case PYTHONSTUB_CommandType::PYTHONSTUB_FinalizeRequest: @@ -597,18 +593,6 @@ Stub::Initialize(bi::managed_external_buffer::handle_t map_handle) initialized_ = true; } -void -Stub::ProcessResponse(InferResponse* response) -{ - response->SaveToSharedMemory(shm_pool_, false /* copy_gpu */); - - for (auto& output_tensor : response->OutputTensors()) { - if (!output_tensor->IsCPU()) { - gpu_tensors_.push_back(output_tensor); - } - } -} - void Stub::LoadGPUBuffers(std::unique_ptr& ipc_message) { @@ -674,7 +658,8 @@ Stub::LoadRequestsFromSharedMemory(RequestBatch* request_batch_shm_ptr) for (size_t i = 0; i < batch_size; i++) { std::shared_ptr infer_request = InferRequest::LoadFromSharedMemory( - shm_pool_, request_shm_handle[i], true /* open_cuda_handle */); + shm_pool_, request_shm_handle[i], true /* open_cuda_handle */, + &ipc_control_->decoupled /* is_model_decoupled */); py_request_list.append(infer_request); } @@ -682,7 +667,7 @@ Stub::LoadRequestsFromSharedMemory(RequestBatch* request_batch_shm_ptr) } void -Stub::ProcessRequestsDecoupled(RequestBatch* request_batch_shm_ptr) +Stub::ProcessRequests(RequestBatch* request_batch_shm_ptr) { py::list py_request_list = LoadRequestsFromSharedMemory(request_batch_shm_ptr); @@ -718,18 +703,21 @@ Stub::ProcessRequestsDecoupled(RequestBatch* request_batch_shm_ptr) py::object execute_return = model_instance_.attr("execute")(py_request_list); + bool is_coroutine = py::module::import("asyncio") .attr("iscoroutine")(execute_return) .cast(); if (is_coroutine) { - RunCoroutine(execute_return); - } else { - if (!py::isinstance(execute_return)) { - throw PythonBackendException( - "Python model '" + name_ + - "' is using the decoupled mode and the execute function must " - "return None."); + if (IsDecoupled()) { + // Do not wait for async decoupled execute to return. 
+ RunCoroutine(execute_return, true /* in_background */); + } else { + py::object coroutine_return = + RunCoroutine(execute_return, false /* in_background */); + ProcessReturnedResponses(py_request_list, coroutine_return); } + } else { + ProcessReturnedResponses(py_request_list, execute_return); } } } @@ -748,160 +736,77 @@ Stub::ProcessRequestsDecoupled(RequestBatch* request_batch_shm_ptr) "Failed to process the request(s) for model '" + name_ + "', message: ") + error_string; - LOG_INFO << err_message.c_str(); + LOG_ERROR << err_message.c_str(); response_batch_shm_ptr->has_error = true; - error_string_shm = PbString::Create(shm_pool_, error_string); + error_string_shm = PbString::Create(shm_pool_, err_message); response_batch_shm_ptr->error = error_string_shm->ShmHandle(); response_batch_shm_ptr->is_error_set = true; + // Once the error is sent to the backend, the backend is supposed to close + // all response factories if not already closed, so closing all response + // senders if not already closed to prevent the model from sending more + // responses after the factories are closed. + for (py::handle py_request : py_request_list) { + InferRequest* request = py_request.cast(); + request->GetResponseSender()->Close(); + } } } void -Stub::ProcessRequests(RequestBatch* request_batch_shm_ptr) +Stub::ProcessReturnedResponses( + py::list py_requests, py::object py_responses_obj) { - std::unique_ptr execute_response = - IPCMessage::Create(shm_pool_, false /* Inline response */); - execute_response->Command() = PYTHONSTUB_ExecuteResponse; - - AllocatedSharedMemory response_batch = shm_pool_->Construct( - request_batch_shm_ptr->batch_size * - sizeof(bi::managed_external_buffer::handle_t) + - sizeof(ResponseBatch)); - ResponseBatch* response_batch_shm_ptr = - reinterpret_cast(response_batch.data_.get()); - - std::unique_ptr error_string_shm; - py::list inference_responses; - - bi::managed_external_buffer::handle_t* responses_shm_handle = - reinterpret_cast( - response_batch.data_.get() + sizeof(ResponseBatch)); - - py::list responses; - - // Notifying the stub should be after responses. - ScopedDefer execute_finalize([this] { stub_message_queue_->Pop(); }); - ScopedDefer _( - [this, &execute_response] { SendIPCMessage(execute_response); }); - - execute_response->Args() = response_batch.handle_; - - bool has_exception = false; - std::string error_string; - try { - response_batch_shm_ptr->has_error = false; - response_batch_shm_ptr->is_error_set = false; - - uint32_t batch_size = request_batch_shm_ptr->batch_size; - - if (batch_size == 0) { - return; - } - - py::list py_request_list = - LoadRequestsFromSharedMemory(request_batch_shm_ptr); - - if (!py::hasattr(model_instance_, "execute")) { - std::string message = "Python model " + model_context_.PythonModelPath() + - " does not implement `execute` method."; - throw PythonBackendException(message); - } - - py::object request_list = py_request_list; - py::module asyncio = py::module::import("asyncio"); - - // Execute Response - py::object execute_return; - py::object responses_obj; - bool is_coroutine; - - { - NVTX_RANGE(nvtx_, "PyExecute " + name_); - execute_return = model_instance_.attr("execute")(request_list); - is_coroutine = asyncio.attr("iscoroutine")(execute_return).cast(); - } - - if (is_coroutine) { - responses_obj = asyncio.attr("run")(execute_return); - } else { - responses_obj = execute_return; - } - - // Check the return type of execute function. 
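The coroutine handling above runs an `async def execute` to completion in the background for decoupled models, while a default-mode coroutine is awaited for its return value. A minimal sketch of a decoupled model with an async `execute`, assuming the standard `triton_python_backend_utils` API and hypothetical tensor names `IN`/`OUT`:

```python
import asyncio

import triton_python_backend_utils as pb_utils


class TritonPythonModel:
    async def execute(self, requests):
        for request in requests:
            sender = request.get_response_sender()
            in0 = pb_utils.get_input_tensor_by_name(request, "IN")
            # Simulate asynchronous work before responding.
            await asyncio.sleep(0.1)
            sender.send(pb_utils.InferenceResponse(
                output_tensors=[pb_utils.Tensor("OUT", in0.as_numpy())]))
            sender.send(flags=pb_utils.TRITONSERVER_RESPONSE_COMPLETE_FINAL)
        # A decoupled async execute must not return responses.
        return None
```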
- if (!py::isinstance(responses_obj)) { - std::string str = py::str(execute_return.get_type()); - throw PythonBackendException( - std::string("Expected a list in the execute return, found type '") + - str + "'."); - } - - responses = responses_obj; - size_t response_size = py::len(responses); - - // If the number of request objects do not match the number of - // response objects throw an error. - if (response_size != batch_size) { - std::string err = - "Number of InferenceResponse objects do not match the number " - "of " - "InferenceRequest objects. InferenceRequest(s) size is:" + - std::to_string(batch_size) + ", and InferenceResponse(s) size is:" + - std::to_string(response_size) + "\n"; - throw PythonBackendException(err); - } - - for (size_t i = 0; i < response_size; i++) { - // Check the return type of execute function. - InferRequest* infer_request = py_request_list[i].cast(); - if (infer_request->ReleaseFlags() == - TRITONSERVER_REQUEST_RELEASE_RESCHEDULE) { - if (!py::isinstance(responses[i])) { - // When the request is rescheduled in non-decoupled model, the - // response must be None. - std::string str = py::str(responses[i].get_type()); - throw PythonBackendException( - "Expected a None object in the execute function return list for " - "reschduled request, " - "found type '" + - str + "'."); - } - } else { - if (!py::isinstance(responses[i])) { - std::string str = py::str(responses[i].get_type()); - throw PythonBackendException( - std::string( - "Expected an 'InferenceResponse' object in the execute " - "function return list, found type '") + - str + "'."); - } - InferResponse* infer_response = responses[i].cast(); - infer_response->PruneOutputTensors( - infer_request->RequestedOutputNames()); - ProcessResponse(infer_response); - responses_shm_handle[i] = infer_response->ShmHandle(); - } - } - response_batch_shm_ptr->batch_size = response_size; + // Return if there is nothing to process. + if (py::isinstance(py_responses_obj)) { + return; } - catch (const PythonBackendException& pb_exception) { - has_exception = true; - error_string = pb_exception.what(); + // Only non-decoupled may return responses. + if (IsDecoupled()) { + throw PythonBackendException( + "Python model '" + name_ + + "' is using the decoupled mode and the execute function must return " + "None."); } - catch (const py::error_already_set& error) { - has_exception = true; - error_string = error.what(); + // Check responses is a list. + if (!py::isinstance(py_responses_obj)) { + throw PythonBackendException( + "Expected a list in the execute return, found type '" + + std::string(py::str(py_responses_obj.get_type())) + "'."); + } + py::list py_responses = py_responses_obj; + // Responses and requests length must match. + size_t requests_size = py::len(py_requests); + size_t responses_size = py::len(py_responses); + if (requests_size != responses_size) { + throw PythonBackendException( + "Number of InferenceResponse objects do not match the number of " + "InferenceRequest objects. 
InferenceRequest(s) size is:" + + std::to_string(requests_size) + ", and InferenceResponse(s) size is:" + + std::to_string(responses_size) + "\n"); } - if (has_exception) { - std::string err_message = - std::string( - "Failed to process the request(s) for model '" + name_ + - "', message: ") + - error_string; - error_string_shm = PbString::Create(shm_pool_, error_string); - response_batch_shm_ptr->has_error = true; - response_batch_shm_ptr->is_error_set = true; - response_batch_shm_ptr->error = error_string_shm->ShmHandle(); + for (size_t i = 0; i < responses_size; i++) { + if (!py::isinstance(py_responses[i])) { + InferRequest* request = py_requests[i].cast(); + // Response must be None if rescheduled. + if (request->ReleaseFlags() == TRITONSERVER_REQUEST_RELEASE_RESCHEDULE) { + throw PythonBackendException( + "Expected a None object in the execute function return list for " + "reschduled request, found type '" + + std::string(py::str(py_responses[i].get_type())) + "'."); + } + // Send the response. + if (!py::isinstance(py_responses[i])) { + throw PythonBackendException( + "Expected an 'InferenceResponse' object in the execute function " + "return list, found type '" + + std::string(py::str(py_responses[i].get_type())) + "'."); + } + std::shared_ptr response = + py_responses[i].cast>(); + request->GetResponseSender()->Send( + response, TRITONSERVER_RESPONSE_COMPLETE_FINAL); + } } } @@ -923,15 +828,19 @@ Stub::GetAsyncEventLoop() return async_event_loop_; } -void -Stub::RunCoroutine(py::object coroutine) +py::object +Stub::RunCoroutine(py::object coroutine, bool in_background) { py::object loop = GetAsyncEventLoop(); py::object py_future = py::module_::import("asyncio").attr( "run_coroutine_threadsafe")(coroutine, loop); - py_future.attr("add_done_callback")( - py::module_::import("c_python_backend_utils") - .attr("async_event_future_done_callback")); + if (in_background) { + py_future.attr("add_done_callback")( + py::module_::import("c_python_backend_utils") + .attr("async_event_future_done_callback")); + return py::none(); + } + return py_future.attr("result")(); } void diff --git a/src/pb_stub.h b/src/pb_stub.h index c9462fd0..10e7606a 100644 --- a/src/pb_stub.h +++ b/src/pb_stub.h @@ -253,11 +253,12 @@ class Stub { /// Execute a batch of requests. 
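The length check above is what a default-mode model satisfies by returning one `InferenceResponse` per request, in order; `ProcessReturnedResponses` then sends each returned response with the final flag on the model's behalf. A minimal sketch, assuming a hypothetical identity model with `INPUT0`/`OUTPUT0`:

```python
import triton_python_backend_utils as pb_utils


class TritonPythonModel:
    def execute(self, requests):
        # Default mode: exactly one InferenceResponse per InferenceRequest.
        responses = []
        for request in requests:
            in0 = pb_utils.get_input_tensor_by_name(request, "INPUT0")
            out0 = pb_utils.Tensor("OUTPUT0", in0.as_numpy())
            responses.append(pb_utils.InferenceResponse(output_tensors=[out0]))
        return responses
```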
void ProcessRequests(RequestBatch* request_batch_shm_ptr); - void ProcessRequestsDecoupled(RequestBatch* request_batch_shm_ptr); + void ProcessReturnedResponses( + py::list py_requests, py::object py_responses_obj); py::object GetAsyncEventLoop(); - void RunCoroutine(py::object coroutine); + py::object RunCoroutine(py::object coroutine, bool in_background); /// Get the memory manager message queue std::unique_ptr>& MemoryManagerQueue(); @@ -265,8 +266,6 @@ class Stub { /// Get the shared memory pool std::unique_ptr& ShmPool() { return shm_pool_; } - void ProcessResponse(InferResponse* response); - void ProcessBLSResponseDecoupled(std::unique_ptr& ipc_message); void LoadGPUBuffers(std::unique_ptr& ipc_message); diff --git a/src/python_be.cc b/src/python_be.cc index b95fb715..cd31e79e 100644 --- a/src/python_be.cc +++ b/src/python_be.cc @@ -153,124 +153,6 @@ ModelInstanceState::SetErrorForResponseSendMessage( } } -void -ModelInstanceState::SendMessageAndReceiveResponse( - bi::managed_external_buffer::handle_t message, - bi::managed_external_buffer::handle_t& response, bool& restart, - std::shared_ptr>& responses, - TRITONBACKEND_Request** requests, const uint32_t request_count) -{ - auto error = SendMessageToStub(message); - if (error != nullptr) { - restart = true; - RespondErrorToAllRequests( - TRITONSERVER_ErrorMessage(error), responses, requests, request_count); - - return; - } - - bi::managed_external_buffer::handle_t response_message; - error = Stub()->ReceiveMessageFromStub(response_message); - if (error != nullptr) { - restart = true; - RespondErrorToAllRequests( - TRITONSERVER_ErrorMessage(error), responses, requests, request_count); - - return; - } - - response = response_message; -} - -TRITONSERVER_Error* -ModelInstanceState::SendMessageToStub( - bi::managed_external_buffer::handle_t message) -{ - bool success = false; - while (!success) { - uint64_t timeout_miliseconds = 1000; - { - boost::posix_time::ptime timeout = - boost::get_system_time() + - boost::posix_time::milliseconds(timeout_miliseconds); - - bi::scoped_lock lock( - *(Stub()->HealthMutex()), timeout); - - // Check if lock has been acquired. - if (lock) { - Stub()->IpcControl()->stub_health = false; - } else { - // If it failed to obtain the lock, it means that the stub has been - // stuck or exited while holding the health mutex lock. 
- return TRITONSERVER_ErrorNew( - TRITONSERVER_ERROR_INTERNAL, "Failed to obtain the health mutex."); - } - } - - Stub()->StubMessageQueue()->Push( - message, timeout_miliseconds /* duration ms */, success); - - if (!success && !IsStubProcessAlive()) { - return TRITONSERVER_ErrorNew( - TRITONSERVER_ERROR_INTERNAL, "Stub process is not healthy."); - } - } - - return nullptr; // success -} - -void -ModelInstanceState::RespondErrorToAllRequests( - const char* message, - std::shared_ptr>& responses, - TRITONBACKEND_Request** requests, const uint32_t request_count) -{ - for (uint32_t r = 0; r < request_count; ++r) { - if ((*responses)[r] == nullptr) - continue; - - std::string err_message = - std::string( - "Failed to process the request(s) for model instance '" + Name() + - "', message: ") + - message; - - TRITONSERVER_Error* err = - TRITONSERVER_ErrorNew(TRITONSERVER_ERROR_INTERNAL, err_message.c_str()); - LOG_IF_ERROR( - TRITONBACKEND_ResponseSend( - (*responses)[r], TRITONSERVER_RESPONSE_COMPLETE_FINAL, err), - "failed sending response"); - - (*responses)[r] = nullptr; - TRITONSERVER_ErrorDelete(err); - } -} - -void -ModelInstanceState::WaitForBLSRequestsToFinish() -{ - futures_.clear(); -} - -bool -ModelInstanceState::IsStubProcessAlive() -{ - boost::posix_time::ptime timeout = - boost::get_system_time() + boost::posix_time::seconds(1); - bi::scoped_lock lock(*Stub()->HealthMutex(), timeout); - - // Check if lock has been acquired. - if (lock) { - return Stub()->IpcControl()->stub_health; - } else { - // If It failed to obtain the lock, it means that the stub has been - // stuck or exited while holding the health mutex lock. - return false; - } -} - TRITONSERVER_Error* ModelInstanceState::SaveRequestsToSharedMemory( TRITONBACKEND_Request** requests, const uint32_t request_count, @@ -408,24 +290,15 @@ ModelInstanceState::SaveRequestsToSharedMemory( request, &request_timeout)); std::unique_ptr infer_request; - if (model_state->IsDecoupled()) { - TRITONBACKEND_ResponseFactory* factory_ptr; - RETURN_IF_ERROR(TRITONBACKEND_ResponseFactoryNew(&factory_ptr, request)); - - infer_request = std::make_unique( - id, correlation_id, pb_input_tensors, requested_output_names, - model_state->Name(), model_state->Version(), parameters_string, flags, - request_timeout, reinterpret_cast(factory_ptr), - reinterpret_cast(request), - PreferredMemory(PreferredMemory::kDefault, 0), trace); - } else { - infer_request = std::make_unique( - id, correlation_id, pb_input_tensors, requested_output_names, - model_state->Name(), model_state->Version(), parameters_string, flags, - request_timeout, 0 /* response_factory_address */, - reinterpret_cast(request), - PreferredMemory(PreferredMemory::kDefault, 0), trace); - } + TRITONBACKEND_ResponseFactory* factory_ptr; + RETURN_IF_ERROR(TRITONBACKEND_ResponseFactoryNew(&factory_ptr, request)); + + infer_request = std::make_unique( + id, correlation_id, pb_input_tensors, requested_output_names, + model_state->Name(), model_state->Version(), parameters_string, flags, + request_timeout, reinterpret_cast(factory_ptr), + reinterpret_cast(request), + PreferredMemory(PreferredMemory::kDefault, 0), trace); RETURN_IF_EXCEPTION(infer_request->SaveToSharedMemory(Stub()->ShmPool())); requests_shm[r] = infer_request->ShmHandle(); pb_infer_requests.emplace_back(std::move(infer_request)); @@ -449,11 +322,8 @@ ModelInstanceState::LaunchStubProcess() thread_pool_ = std::make_unique( model_state->StateForBackend()->thread_pool_size); - if (model_state->IsDecoupled()) { - decoupled_thread_ = 
true; - decoupled_monitor_ = - std::thread(&ModelInstanceState::DecoupledMessageQueueMonitor, this); - } + queue_monitor_thread_ = true; + queue_monitor_ = std::thread(&ModelInstanceState::MessageQueueMonitor, this); request_executor_ = std::make_unique( Stub()->ShmPool(), model_state->TritonServer()); @@ -700,7 +570,8 @@ ModelInstanceState::ExecuteBLSRequest( reinterpret_cast( request_batch.data_.get() + sizeof(RequestBatch)); infer_request = InferRequest::LoadFromSharedMemory( - Stub()->ShmPool(), *request_handle, false /* open_cuda_handle */); + Stub()->ShmPool(), *request_handle, false /* open_cuda_handle */, + nullptr /* is_model_decoupled */); // If the BLS inputs are in GPU an additional round trip between the // stub process and the main process is required. The reason is that we @@ -806,9 +677,9 @@ ModelInstanceState::ExecuteBLSRequest( } void -ModelInstanceState::DecoupledMessageQueueMonitor() +ModelInstanceState::MessageQueueMonitor() { - while (decoupled_thread_) { + while (queue_monitor_thread_) { bi::managed_external_buffer::handle_t handle = Stub()->ParentMessageQueue()->Pop(); if (handle == DUMMY_MESSAGE) { @@ -1306,7 +1177,7 @@ ModelInstanceState::ResponseSendDecoupled( } TRITONSERVER_Error* -ModelInstanceState::ProcessRequestsDecoupled( +ModelInstanceState::ProcessRequests( TRITONBACKEND_Request** requests, const uint32_t request_count, std::vector>& pb_infer_requests, PbMetricReporter& reporter) @@ -1365,7 +1236,7 @@ ModelInstanceState::ProcessRequestsDecoupled( uint64_t compute_end_ns = 0; SET_TIMESTAMP(compute_end_ns); reporter.SetComputeEndNs(compute_end_ns); - reporter.SetBatchStatistics(request_count); + reporter.SetBatchStatistics(total_batch_size); if (response_batch.data_->has_error) { if (response_batch.data_->is_error_set) { @@ -1382,364 +1253,6 @@ ModelInstanceState::ProcessRequestsDecoupled( return nullptr; // success } -void -ModelInstanceState::ProcessRequests( - TRITONBACKEND_Request** requests, const uint32_t request_count, - std::vector>& pb_infer_requests, - bool& restart) -{ - NVTX_RANGE(nvtx_, "ProcessRequests " + Name()); - ModelState* model_state = reinterpret_cast(Model()); - std::string name = model_state->Name(); - - LOG_MESSAGE( - TRITONSERVER_LOG_VERBOSE, - (std::string("model ") + model_state->Name() + ", instance " + Name() + - ", executing " + std::to_string(request_count) + " requests") - .c_str()); - - uint64_t exec_start_ns = 0; - SET_TIMESTAMP(exec_start_ns); - - // We take the responsibility of the responses. - std::shared_ptr> responses( - new std::vector()); - responses->reserve(request_count); - PbMetricReporter reporter( - TritonModelInstance(), requests, request_count, responses); - reporter.SetExecStartNs(exec_start_ns); - - for (size_t i = 0; i < request_count; i++) { - TRITONBACKEND_Response* response; - auto err = TRITONBACKEND_ResponseNew(&response, requests[i]); - if (err == nullptr) { - responses->emplace_back(response); - } else { - responses->emplace_back(nullptr); - LOG_MESSAGE(TRITONSERVER_LOG_ERROR, "Fail to create response"); - TRITONSERVER_ErrorDelete(err); - } - } - - size_t total_batch_size = 0; - RESPOND_ALL_AND_RETURN_IF_ERROR( - responses, request_count, - CheckIncomingRequests(requests, request_count, total_batch_size)); - - // No request to process - if (total_batch_size == 0) { - return; - } - - // Wait for all the pending BLS requests to be completed. 
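The `ExecuteBLSRequest` path above now serves BLS calls issued from both default and decoupled models. For reference, a typical in-model BLS call looks roughly like the sketch below; the downstream model name `preprocess` and its `CLEAN` output are illustrative assumptions, not part of this patch.

```python
import triton_python_backend_utils as pb_utils


def call_preprocess(raw_tensor):
    # BLS call to a hypothetical deployed model named "preprocess".
    infer_request = pb_utils.InferenceRequest(
        model_name="preprocess",
        requested_output_names=["CLEAN"],
        inputs=[raw_tensor])
    infer_response = infer_request.exec()
    if infer_response.has_error():
        raise pb_utils.TritonModelException(infer_response.error().message())
    return pb_utils.get_output_tensor_by_name(infer_response, "CLEAN")
```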
- ScopedDefer bls_defer([this] { WaitForBLSRequestsToFinish(); }); - AllocatedSharedMemory request_batch; - RESPOND_ALL_AND_RETURN_IF_ERROR( - responses, request_count, - SaveRequestsToSharedMemory( - requests, request_count, pb_infer_requests, request_batch, - responses)); - - std::shared_ptr ipc_message = - IPCMessage::Create(Stub()->ShmPool(), false /*inline_response*/); - ipc_message->Command() = PYTHONSTUB_CommandType::PYTHONSTUB_ExecuteRequest; - ipc_message->Args() = request_batch.handle_; - - uint64_t compute_start_ns = 0; - SET_TIMESTAMP(compute_start_ns); - reporter.SetComputeStartNs(compute_start_ns); - - // This means that the stub process has exited and Python - // backend failed to restart the stub process. - if (!Stub()->StubActive()) { - const char* error_message = "The stub process has exited unexpectedly."; - RespondErrorToAllRequests( - error_message, responses, requests, request_count); - return; - } - - bi::managed_external_buffer::handle_t response_message; - { - NVTX_RANGE(nvtx_, "StubProcessing " + Name()); - SendMessageAndReceiveResponse( - ipc_message->ShmHandle(), response_message, restart, responses, - requests, request_count); - } - - ScopedDefer execute_finalize([this, &restart] { - // Push a dummy message to the message queue so that - // the stub process is notified that it can release - // the object stored in shared memory. - NVTX_RANGE(nvtx_, "RequestExecuteFinalize " + Name()); - if (!restart) - // Push a dummy message to signal the thread to terminate. - Stub()->StubMessageQueue()->Push(DUMMY_MESSAGE); - }); - if (restart) { - return; - } - - RESPOND_ALL_AND_RETURN_IF_EXCEPTION( - responses, request_count, - ipc_message = IPCMessage::LoadFromSharedMemory( - Stub()->ShmPool(), response_message)); - - // If the stub command is no longer PYTHONSTUB_InferExecRequest, it indicates - // that inference request execution has finished and there are no more BLS - // requests to execute. Otherwise, the Python backend will continuously - // execute BLS requests pushed to the message queue. - while (ipc_message->Command() == - PYTHONSTUB_CommandType::PYTHONSTUB_InferExecRequest || - ipc_message->Command() == - PYTHONSTUB_CommandType::PYTHONSTUB_InferStreamExecRequest) { - std::packaged_task task([this, ipc_message] { - ExecuteBLSRequest( - ipc_message, - (ipc_message->Command() == - PYTHONSTUB_CommandType::PYTHONSTUB_InferStreamExecRequest)); - }); - std::future future = - boost::asio::post(*thread_pool_, std::move(task)); - futures_.emplace_back(std::move(future)); - - auto error = Stub()->ReceiveMessageFromStub(response_message); - if (error != nullptr) { - restart = true; - RespondErrorToAllRequests( - TRITONSERVER_ErrorMessage(error), responses, requests, request_count); - return; - } - - RESPOND_ALL_AND_RETURN_IF_EXCEPTION( - responses, request_count, - ipc_message = IPCMessage::LoadFromSharedMemory( - Stub()->ShmPool(), response_message)); - } - - uint64_t compute_end_ns = 0; - SET_TIMESTAMP(compute_end_ns); - reporter.SetComputeEndNs(compute_end_ns); - - // Parsing the request response - AllocatedSharedMemory response_batch; - RESPOND_ALL_AND_RETURN_IF_EXCEPTION( - responses, request_count, - response_batch = Stub()->ShmPool()->Load(ipc_message->Args())); - - ResponseBatch* response_batch_shm_ptr = - reinterpret_cast(response_batch.data_.get()); - - // If inference fails, release all the requests and send an error response. 
- // If inference fails at this stage, it usually indicates a bug in the model - // code - if (response_batch_shm_ptr->has_error) { - if (response_batch_shm_ptr->is_error_set) { - std::unique_ptr error_message_shm; - RESPOND_ALL_AND_RETURN_IF_EXCEPTION( - responses, request_count, - error_message_shm = PbString::LoadFromSharedMemory( - Stub()->ShmPool(), response_batch_shm_ptr->error)); - RespondErrorToAllRequests( - error_message_shm->String().c_str(), responses, requests, - request_count); - } else { - const char* error_message = - "Failed to fetch the error in response batch."; - RespondErrorToAllRequests( - error_message, responses, requests, request_count); - } - - // Reset the release flags for all the requests. - for (auto& infer_request : pb_infer_requests) { - infer_request->SetReleaseFlags(TRITONSERVER_REQUEST_RELEASE_ALL); - } - return; - } - - bi::managed_external_buffer::handle_t* response_shm_handle = - reinterpret_cast( - response_batch.data_.get() + sizeof(ResponseBatch)); - - // If the output provided by the model is in GPU, we will pass the list of - // buffers provided by Triton to the stub process. - bool has_gpu_output = false; - std::vector requires_deferred_callback; - - std::vector> shm_responses; - std::vector, void*>>> - gpu_output_buffers(request_count); - GPUBuffersHelper gpu_buffer_helper; - - for (uint32_t r = 0; r < request_count; ++r) { - NVTX_RANGE(nvtx_, "LoadingResponse " + Name()); - TRITONBACKEND_Response* response = (*responses)[r]; - TRITONBACKEND_Request* request = requests[r]; - uint32_t requested_output_count = 0; - requires_deferred_callback.push_back(false); - - shm_responses.emplace_back(nullptr); - std::unique_ptr& infer_response = shm_responses.back(); - try { - if (pb_infer_requests[r]->ReleaseFlags() == - TRITONSERVER_REQUEST_RELEASE_RESCHEDULE) { - // For rescheduled requests, we do not need to send a response. - LOG_IF_ERROR( - TRITONBACKEND_ResponseDelete((*responses)[r]), - "failed to delete response"); - (*responses)[r] = nullptr; - continue; - } - infer_response = InferResponse::LoadFromSharedMemory( - Stub()->ShmPool(), response_shm_handle[r], - false /* open_cuda_handle */); - if (infer_response->HasError()) { - TRITONSERVER_Error* err = TRITONSERVER_ErrorNew( - infer_response->Error()->Code(), - infer_response->Error()->Message().c_str()); - - LOG_IF_ERROR( - TRITONBACKEND_ResponseSend( - (*responses)[r], TRITONSERVER_RESPONSE_COMPLETE_FINAL, err), - "failed sending response"); - TRITONSERVER_ErrorDelete(err); - (*responses)[r] = nullptr; - - // Reset the release flags for the request. - pb_infer_requests[r]->SetReleaseFlags(TRITONSERVER_REQUEST_RELEASE_ALL); - - // If has_error is true, we do not look at the response tensors. - continue; - } - } - catch (const PythonBackendException& pb_exception) { - TRITONSERVER_Error* err = TRITONSERVER_ErrorNew( - TRITONSERVER_ERROR_INTERNAL, pb_exception.what()); - LOG_IF_ERROR( - TRITONBACKEND_ResponseSend( - (*responses)[r], TRITONSERVER_RESPONSE_COMPLETE_FINAL, err), - "failed sending response"); - TRITONSERVER_ErrorDelete(err); - (*responses)[r] = nullptr; - - // Reset the release flags for the request. 
- pb_infer_requests[r]->SetReleaseFlags(TRITONSERVER_REQUEST_RELEASE_ALL); - - continue; - } - - GUARDED_RESPOND_IF_ERROR( - responses, r, - TRITONBACKEND_RequestOutputCount(request, &requested_output_count)); - - std::set requested_output_names; - for (size_t j = 0; j < requested_output_count; ++j) { - const char* output_name; - GUARDED_RESPOND_IF_ERROR( - responses, r, - TRITONBACKEND_RequestOutputName(request, j, &output_name)); - requested_output_names.insert(output_name); - } - - bool require_deferred_callback = false; - -#ifdef TRITON_ENABLE_GPU - for (auto& output_tensor : infer_response->OutputTensors()) { - if (output_tensor->MemoryType() == TRITONSERVER_MEMORY_GPU) { - // Attempt to use the cuda shared memory pool for GPU tensor. - ShareCUDAMemoryPool(output_tensor->MemoryTypeId()); - } - } -#endif // TRITON_ENABLE_GPU - - gpu_output_buffers[r] = - std::vector, void*>>{}; - infer_response->Send( - response, CudaStream(), require_deferred_callback, - TRITONSERVER_RESPONSE_COMPLETE_FINAL, Stub()->ShmPool(), - gpu_buffer_helper, gpu_output_buffers[r], requested_output_names); - - requires_deferred_callback[r] = require_deferred_callback; - - if (requires_deferred_callback[r]) { - has_gpu_output = true; - } - } - - // Finalize the execute. - execute_finalize.Complete(); - - // If the output tensor is in GPU, there will be a second round trip - // required for filling the GPU buffers provided by the main process. - if (has_gpu_output) { - ipc_message->Command() = PYTHONSTUB_CommandType::PYTHONSTUB_LoadGPUBuffers; - gpu_buffer_helper.Complete(Stub()->ShmPool()); - ipc_message->Args() = gpu_buffer_helper.ShmHandle(); - SendMessageAndReceiveResponse( - ipc_message->ShmHandle(), response_message, restart, responses, - requests, 0); - - bool cuda_copy = false; - - uint32_t response_index = 0; - for (auto& gpu_output_buffer : gpu_output_buffers) { - for (auto& buffer_memory_pair : gpu_output_buffer) { - auto& pb_memory = buffer_memory_pair.first; - void* pointer = buffer_memory_pair.second; - bool cuda_used = false; - - if (pb_memory->MemoryType() == TRITONSERVER_MEMORY_CPU) { - GUARDED_RESPOND_IF_ERROR( - responses, response_index, - CopyBuffer( - "Failed to copy the output tensor to buffer.", - TRITONSERVER_MEMORY_CPU, 0, TRITONSERVER_MEMORY_CPU, 0, - pb_memory->ByteSize(), pb_memory->DataPtr(), pointer, - CudaStream(), &cuda_used)); - cuda_copy |= cuda_used; - } else if ( - (pb_memory->MemoryType() == TRITONSERVER_MEMORY_GPU) && - pb_memory->UseCUDASharedPool() && - (pb_memory->DataPtr() != pointer)) { - // If the data pointer from pb_memory is not the same as the pointer, - // it means that the Triton-provided buffer is not used during tensor - // transfer. Instead, an intermediate buffer that uses CUDA shared - // memory pool is used. In this case, we need to copy the data - // from the intermediate buffer back to the Triton-provided buffer. 
- GUARDED_RESPOND_IF_ERROR( - responses, response_index, - CopyBuffer( - "Failed to copy the output tensor to buffer.", - TRITONSERVER_MEMORY_GPU, pb_memory->MemoryTypeId(), - TRITONSERVER_MEMORY_GPU, pb_memory->MemoryTypeId(), - pb_memory->ByteSize(), pb_memory->DataPtr(), pointer, - CudaStream(), &cuda_used)); - cuda_copy |= cuda_used; - } - } - response_index++; -#ifdef TRITON_ENABLE_GPU - if (cuda_copy) { - cudaStreamSynchronize(stream_); - } -#endif // TRITON_ENABLE_GPU - } - } - - bls_defer.Complete(); - for (uint32_t r = 0; r < request_count; ++r) { - if (requires_deferred_callback[r]) { - shm_responses[r]->DeferredSendCallback(); - } - } - - uint64_t exec_end_ns = 0; - SET_TIMESTAMP(exec_end_ns); - reporter.SetExecEndNs(exec_end_ns); - reporter.SetBatchStatistics(total_batch_size); - - return; -} - void ModelInstanceState::PrepareResponseBatch( ResponseBatch** response_batch, @@ -1873,18 +1386,13 @@ ModelInstanceState::ShareCUDAMemoryPool(const int32_t device_id) ModelInstanceState::~ModelInstanceState() { - ModelState* model_state = reinterpret_cast(Model()); Stub()->UpdateHealth(); if (Stub()->IsHealthy()) { - if (model_state->IsDecoupled()) { - // Wait for all the pending tasks to finish. - thread_pool_->wait(); - // Push a dummy message to signal the thread to terminate. - Stub()->ParentMessageQueue()->Push(DUMMY_MESSAGE); - decoupled_monitor_.join(); - } else { - thread_pool_->wait(); - } + // Wait for all the pending tasks to finish. + thread_pool_->wait(); + // Push a dummy message to signal the thread to terminate. + Stub()->ParentMessageQueue()->Push(DUMMY_MESSAGE); + queue_monitor_.join(); } // Terminate stub first to allow any last messages to be received by the back // end before deallocating the queue memory @@ -2445,36 +1953,10 @@ TRITONBACKEND_ModelInstanceExecute( // If restart is equal to true, it indicates that the stub process is // unhealthy and needs a restart. - bool restart = false; - ModelState* model_state = - reinterpret_cast(instance_state->Model()); - std::vector> infer_requests; - if (!model_state->IsDecoupled()) { - instance_state->ProcessRequests( - requests, request_count, infer_requests, restart); + // TODO: Implement restart on decoupled - if (restart) { - LOG_MESSAGE( - TRITONSERVER_LOG_ERROR, - "Stub process is unhealthy and it will be restarted."); - instance_state->TerminateMonitor(); - instance_state->Stub()->KillStubProcess(); - TRITONSERVER_Error* err = instance_state->Stub()->Setup(); - if (err == nullptr) { - instance_state->StartMonitor(); - } - LOG_IF_ERROR(err, "Failed to restart the stub process."); - err = instance_state->Stub()->Launch(); - LOG_IF_ERROR( - err, - "Failed to restart the stub process: failed to launch " - "the stub process."); - // Reset the release flags for all the requests. - for (auto& infer_request : infer_requests) { - infer_request->SetReleaseFlags(TRITONSERVER_REQUEST_RELEASE_ALL); - } - } - } else { + std::vector> infer_requests; + { uint64_t exec_start_ns = 0; SET_TIMESTAMP(exec_start_ns); @@ -2483,7 +1965,7 @@ TRITONBACKEND_ModelInstanceExecute( nullptr); reporter.SetExecStartNs(exec_start_ns); - error = instance_state->ProcessRequestsDecoupled( + error = instance_state->ProcessRequests( requests, request_count, infer_requests, reporter); uint64_t exec_end_ns = 0; diff --git a/src/python_be.h b/src/python_be.h index 9618204c..59660fc4 100644 --- a/src/python_be.h +++ b/src/python_be.h @@ -1,4 +1,4 @@ -// Copyright 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+// Copyright 2022-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions @@ -287,9 +287,9 @@ class ModelInstanceState : public BackendModelInstance { std::thread stub_to_parent_queue_monitor_; bool stub_to_parent_thread_; - // Decoupled monitor thread - std::thread decoupled_monitor_; - bool decoupled_thread_; + // Queue monitor thread + std::thread queue_monitor_; + bool queue_monitor_thread_; std::mutex mu_; std::condition_variable cv_; std::unique_ptr received_message_; @@ -309,30 +309,12 @@ class ModelInstanceState : public BackendModelInstance { // Launch stub process. TRITONSERVER_Error* LaunchStubProcess(); - TRITONSERVER_Error* SendMessageToStub( - bi::managed_external_buffer::handle_t message); void ResponseSendDecoupled(std::shared_ptr response_send_message); - // Checks whether the stub process is live - bool IsStubProcessAlive(); - - // Get a message from the stub process - void SendMessageAndReceiveResponse( - bi::managed_external_buffer::handle_t message, - bi::managed_external_buffer::handle_t& response, bool& restart, - std::shared_ptr>& responses, - TRITONBACKEND_Request** requests, const uint32_t request_count); - - // Responds to all the requests with an error message. - void RespondErrorToAllRequests( - const char* message, - std::shared_ptr>& responses, - TRITONBACKEND_Request** requests, const uint32_t request_count); - - // In the decoupled mode, the parent message queue is monitored only by this - // function during the execute phase. No other thread should pop any message - // from the message queue in the decoupled mode. - void DecoupledMessageQueueMonitor(); + // The parent message queue is monitored only by this function during the + // execute phase. No other thread should pop any message from the message + // queue. + void MessageQueueMonitor(); // This function is executed on a separate thread and monitors the queue for // message sent from stub to parent process. @@ -347,14 +329,8 @@ class ModelInstanceState : public BackendModelInstance { TRITONBACKEND_Request* request, std::shared_ptr>& responses); - // Process all the requests obtained from Triton. - void ProcessRequests( - TRITONBACKEND_Request** requests, const uint32_t request_count, - std::vector>& pb_infer_requests, - bool& restart); - // Process all the requests in the decoupled mode. - TRITONSERVER_Error* ProcessRequestsDecoupled( + TRITONSERVER_Error* ProcessRequests( TRITONBACKEND_Request** requests, const uint32_t request_count, std::vector>& pb_infer_requests, PbMetricReporter& pb_metric_reporter); @@ -368,9 +344,6 @@ class ModelInstanceState : public BackendModelInstance { // Cleanup BLS responses void CleanupBLSResponses(); - // Wait for BLS requests to complete - void WaitForBLSRequestsToFinish(); - // Check the incoming requests for errors TRITONSERVER_Error* CheckIncomingRequests( TRITONBACKEND_Request** requests, const uint32_t request_count, diff --git a/src/response_sender.cc b/src/response_sender.cc index 94e3f0c8..74914ab4 100644 --- a/src/response_sender.cc +++ b/src/response_sender.cc @@ -1,4 +1,4 @@ -// Copyright 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// Copyright 2022-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
// // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions @@ -35,13 +35,31 @@ namespace triton { namespace backend { namespace python { +void +CheckResponseSenderArguments( + const std::shared_ptr& response, const uint32_t flags) +{ + // Check the correctness of the provided flags. + if (flags != TRITONSERVER_RESPONSE_COMPLETE_FINAL && flags != 0) { + throw PythonBackendException( + "Unable to send response. Unsupported flag provided."); + } + + if (flags == 0 && response == nullptr) { + throw PythonBackendException( + "Inference Response object must be provided when the response flags is " + "set to zero."); + } +} + ResponseSender::ResponseSender( intptr_t request_address, intptr_t response_factory_address, - std::unique_ptr& shm_pool, + bool const* is_decoupled, std::unique_ptr& shm_pool, const std::shared_ptr& pb_cancel) : request_address_(request_address), - response_factory_address_(response_factory_address), shm_pool_(shm_pool), - closed_(false), pb_cancel_(pb_cancel) + response_factory_address_(response_factory_address), + is_decoupled_(is_decoupled), shm_pool_(shm_pool), pb_cancel_(pb_cancel), + closed_(false), number_of_response_sent_(0) { } @@ -54,15 +72,32 @@ ResponseSender::~ResponseSender() } void -ResponseSender::Send( - std::shared_ptr infer_response, const uint32_t flags) +ResponseSender::UpdateStateAndCounters( + const std::shared_ptr& response, const uint32_t flags) { - // Release the GIL. This avoids a potential deadlock situation in the parent - // process, where every thread in the thread pool is indirectly waiting for a - // function in the stub process that acquires the GIL. Meanwhile, the current - // thread, which holds the GIL, is also waiting for the parent side to have - // the next available thread to pick up the job during resource contention. - py::gil_scoped_release release; + if (is_decoupled_ == nullptr) { + // TODO: Can a model access the response sender on a BLS infer request? + throw PythonBackendException( + "Unable to send response. Response sender has no reference to the " + "decoupled state of the model."); + } + bool is_decoupled = *is_decoupled_; + + std::lock_guard lk(mu_); + + if (!is_decoupled) { + if (response != nullptr && number_of_response_sent_ > 0) { + throw PythonBackendException( + "Unable to send response. Non-decoupled model cannot send more than " + "one response."); + } + if (response == nullptr && flags == TRITONSERVER_RESPONSE_COMPLETE_FINAL && + number_of_response_sent_ == 0) { + throw PythonBackendException( + "Unable to send response. Non-decoupled model cannot send complete " + "final before sending a response."); + } + } if (closed_) { throw PythonBackendException( @@ -72,18 +107,22 @@ ResponseSender::Send( if (flags == TRITONSERVER_RESPONSE_COMPLETE_FINAL) { closed_ = true; } + number_of_response_sent_++; +} - // Check the correctness of the provided flags. - if (flags != TRITONSERVER_RESPONSE_COMPLETE_FINAL && flags != 0) { - throw PythonBackendException( - "Unable to send response. Unsupported flag provided."); - } +void +ResponseSender::Send( + std::shared_ptr infer_response, const uint32_t flags) +{ + // Release the GIL. This avoids a potential deadlock situation in the parent + // process, where every thread in the thread pool is indirectly waiting for a + // function in the stub process that acquires the GIL. 
Meanwhile, the current + // thread, which holds the GIL, is also waiting for the parent side to have + // the next available thread to pick up the job during resource contention. + py::gil_scoped_release release; - if (flags == 0 && infer_response == nullptr) { - throw PythonBackendException( - "Inference Response object must be provided when the response flags is " - "set to zero."); - } + CheckResponseSenderArguments(infer_response, flags); + UpdateStateAndCounters(infer_response, flags); std::unique_ptr& stub = Stub::GetOrCreateInstance(); @@ -147,9 +186,26 @@ ResponseSender::Send( } if (has_gpu_output) { + ScopedDefer _([send_message_payload] { + bi::scoped_lock guard{send_message_payload->mu}; + send_message_payload->is_stub_turn = false; + send_message_payload->cv.notify_one(); + while (!send_message_payload->is_stub_turn) { + // Wait for the stub process to send the response and populate error + // message if any. + send_message_payload->cv.wait(guard); + } + }); + AllocatedSharedMemory gpu_buffers_handle = shm_pool_->Load( send_message_payload->gpu_buffers_handle); + if (!gpu_buffers_handle.data_->success) { + std::unique_ptr error = PbString::LoadFromSharedMemory( + shm_pool_, gpu_buffers_handle.data_->error); + throw PythonBackendException( + "Failed to load GPU buffers: " + error->String()); + } AllocatedSharedMemory gpu_buffers_handle_shm = @@ -157,12 +213,11 @@ ResponseSender::Send( gpu_buffers_handle.data_->buffers); uint64_t gpu_buffer_count = gpu_buffers_handle.data_->buffer_count; if (gpu_tensors.size() != gpu_buffer_count) { - LOG_ERROR - << (std::string( - "GPU buffers size does not match the provided buffers: ") + - std::to_string(gpu_tensors.size()) + - " != " + std::to_string(gpu_buffer_count)); - return; + throw PythonBackendException( + std::string( + "GPU buffers size does not match the provided buffers: ") + + std::to_string(gpu_tensors.size()) + + " != " + std::to_string(gpu_buffer_count)); } std::vector> dst_buffers; @@ -175,17 +230,6 @@ ResponseSender::Send( std::shared_ptr& src_buffer = gpu_tensors[i]; PbMemory::CopyBuffer(dst_buffers[i], src_buffer->Memory()); } - - { - bi::scoped_lock guard{send_message_payload->mu}; - send_message_payload->is_stub_turn = false; - send_message_payload->cv.notify_one(); - while (!send_message_payload->is_stub_turn) { - // Wait for the stub process to send the response and populate error - // message if any. - send_message_payload->cv.wait(guard); - } - } } if (send_message_payload->has_error) { @@ -206,4 +250,11 @@ ResponseSender::IsCancelled() return pb_cancel_->IsCancelled(); } +void +ResponseSender::Close() +{ + std::lock_guard lk(mu_); + closed_ = true; +} + }}} // namespace triton::backend::python diff --git a/src/response_sender.h b/src/response_sender.h index d29a6ab6..1b57508e 100644 --- a/src/response_sender.h +++ b/src/response_sender.h @@ -1,4 +1,4 @@ -// Copyright 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// Copyright 2022-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
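`UpdateStateAndCounters` above lets a default-mode model deliver its single response either together with the final flag or followed by a flag-only call. Both accepted patterns, sketched on top of the `pb_utils` response sender (the helper function names are illustrative):

```python
import triton_python_backend_utils as pb_utils

FINAL = pb_utils.TRITONSERVER_RESPONSE_COMPLETE_FINAL


def send_in_one_call(request, response):
    # Response and final flag together.
    request.get_response_sender().send(response, flags=FINAL)


def send_in_two_calls(request, response):
    # Response first, then a flag-only final call. Sending more than one
    # response, or a final-only call before any response, raises in
    # default mode.
    sender = request.get_response_sender()
    sender.send(response)
    sender.send(flags=FINAL)
```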
// // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions @@ -26,6 +26,8 @@ #pragma once +#include + #include "infer_response.h" #include "pb_cancel.h" #include "shm_manager.h" @@ -36,17 +38,27 @@ class ResponseSender { public: ResponseSender( intptr_t request_address, intptr_t response_factory_address, - std::unique_ptr& shm_pool, + bool const* is_decoupled, std::unique_ptr& shm_pool, const std::shared_ptr& pb_cancel); ~ResponseSender(); void Send(std::shared_ptr response, const uint32_t flags); bool IsCancelled(); + // Can be useful at stopping the model from sending any more responses. + void Close(); + private: + void UpdateStateAndCounters( + const std::shared_ptr& response, const uint32_t flags); + intptr_t request_address_; intptr_t response_factory_address_; + bool const* is_decoupled_; std::unique_ptr& shm_pool_; - bool closed_; std::shared_ptr pb_cancel_; + + std::mutex mu_; + bool closed_; + size_t number_of_response_sent_; }; }}} // namespace triton::backend::python From ebc8c6cd5d9a04981b1d24dc9e5db9e2d5a81974 Mon Sep 17 00:00:00 2001 From: Jacky <18255193+kthui@users.noreply.github.com> Date: Fri, 7 Jun 2024 16:01:24 -0700 Subject: [PATCH 26/54] fix: [precaution fix] Capture Python futures while running in the background (#365) * Capture futures while running in background * Scoped defer background future removal * Use pybind11 provided python set --- src/pb_stub.cc | 61 +++++++++++++++++++++++++++++++++----------------- src/pb_stub.h | 3 +++ 2 files changed, 43 insertions(+), 21 deletions(-) diff --git a/src/pb_stub.cc b/src/pb_stub.cc index 87410a70..2a6be556 100644 --- a/src/pb_stub.cc +++ b/src/pb_stub.cc @@ -107,27 +107,8 @@ PyDefaultArgumentToMutableType(const py::object& argument) void AsyncEventFutureDoneCallback(const py::object& py_future) { - // TODO: Why using `py_future.result()` with error hangs on exit? 
- try { - py::object exception = py_future.attr("exception")(); - if (!py::isinstance(exception)) { - std::string err_msg = ""; - py::object traceback = py::module_::import("traceback") - .attr("TracebackException") - .attr("from_exception")(exception) - .attr("format")(); - for (py::handle line : traceback) { - err_msg += py::str(line); - } - LOG_ERROR << err_msg; - } - } - catch (const PythonBackendException& pb_exception) { - LOG_ERROR << pb_exception.what(); - } - catch (const py::error_already_set& error) { - LOG_ERROR << error.what(); - } + std::unique_ptr& stub = Stub::GetOrCreateInstance(); + stub->BackgroundFutureDone(py_future); } void @@ -556,6 +537,7 @@ Stub::Initialize(bi::managed_external_buffer::handle_t map_handle) c_python_backend_utils.attr("shared_memory") = py::cast(shm_pool_.get()); async_event_loop_ = py::none(); + background_futures_ = py::set(); py::object TritonPythonModel = sys.attr("TritonPythonModel"); deserialize_bytes_ = python_backend_utils.attr("deserialize_bytes_tensor"); @@ -838,11 +820,47 @@ Stub::RunCoroutine(py::object coroutine, bool in_background) py_future.attr("add_done_callback")( py::module_::import("c_python_backend_utils") .attr("async_event_future_done_callback")); + background_futures_.attr("add")(py_future); return py::none(); } return py_future.attr("result")(); } +void +Stub::BackgroundFutureDone(const py::object& py_future) +{ + ScopedDefer _([this, &py_future] { + // Remove future from background + try { + background_futures_.attr("remove")(py_future); + } + catch (const py::error_already_set& error) { + LOG_ERROR << "Cannot remove future from background; " << error.what(); + } + }); + // TODO: Why using `py_future.result()` with error hangs on exit? + try { + py::object exception = py_future.attr("exception")(); + if (!py::isinstance(exception)) { + std::string err_msg = ""; + py::object traceback = py::module_::import("traceback") + .attr("TracebackException") + .attr("from_exception")(exception) + .attr("format")(); + for (py::handle line : traceback) { + err_msg += py::str(line); + } + LOG_ERROR << err_msg; + } + } + catch (const PythonBackendException& pb_exception) { + LOG_ERROR << pb_exception.what(); + } + catch (const py::error_already_set& error) { + LOG_ERROR << error.what(); + } +} + void Stub::UpdateHealth() { @@ -923,6 +941,7 @@ Stub::~Stub() { py::gil_scoped_acquire acquire; async_event_loop_ = py::none(); + background_futures_ = py::none(); model_instance_ = py::none(); } stub_instance_.reset(); diff --git a/src/pb_stub.h b/src/pb_stub.h index 10e7606a..9ed74d9a 100644 --- a/src/pb_stub.h +++ b/src/pb_stub.h @@ -260,6 +260,8 @@ class Stub { py::object RunCoroutine(py::object coroutine, bool in_background); + void BackgroundFutureDone(const py::object& py_future); + /// Get the memory manager message queue std::unique_ptr>& MemoryManagerQueue(); @@ -367,6 +369,7 @@ class Stub { py::object deserialize_bytes_; py::object serialize_bytes_; py::object async_event_loop_; + py::object background_futures_; std::unique_ptr> stub_message_queue_; std::unique_ptr> From bfabfdbf4aa1e3db36aaf9e640b1ce5e0a720f48 Mon Sep 17 00:00:00 2001 From: Jacky <18255193+kthui@users.noreply.github.com> Date: Wed, 12 Jun 2024 09:35:43 -0700 Subject: [PATCH 27/54] fix: Models should filter outputs based on requested outputs (#366) * Prune non requested outputs from non-decoupled models * Prune non requested outputs from decoupled models * [chore] Remove redundant copy --- src/infer_request.cc | 8 ++++---- src/response_sender.cc | 12 +++++++++--- 
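The `background_futures_` set added in the patch above keeps a strong reference to each in-flight future so it is not garbage collected before its done-callback runs; the callback logs any exception and drops the reference. The same pattern in plain `asyncio`, as a standalone sketch rather than backend code:

```python
import asyncio
import threading
import time
import traceback

background_futures = set()


def _future_done(fut):
    background_futures.discard(fut)  # drop the strong reference once finished
    exc = fut.exception()
    if exc is not None:  # surface errors from fire-and-forget work
        traceback.print_exception(type(exc), exc, exc.__traceback__)


async def work():
    await asyncio.sleep(0.1)


loop = asyncio.new_event_loop()
threading.Thread(target=loop.run_forever, daemon=True).start()

future = asyncio.run_coroutine_threadsafe(work(), loop)
background_futures.add(future)  # keep the future alive until its callback runs
future.add_done_callback(_future_done)

time.sleep(0.5)  # give the background task time to finish in this demo
```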
src/response_sender.h | 5 ++++- 3 files changed, 17 insertions(+), 8 deletions(-) diff --git a/src/infer_request.cc b/src/infer_request.cc index 57ea6cf1..8a95b524 100644 --- a/src/infer_request.cc +++ b/src/infer_request.cc @@ -68,14 +68,13 @@ InferRequest::InferRequest( } } - inputs_ = inputs; - requested_output_names_ = requested_output_names; #ifdef TRITON_PB_STUB pb_cancel_ = std::make_shared(response_factory_address_, request_address_); response_sender_ = std::make_shared( request_address_, response_factory_address_, nullptr /* is_decoupled */, - Stub::GetOrCreateInstance()->SharedMemory(), pb_cancel_); + RequestedOutputNames(), Stub::GetOrCreateInstance()->SharedMemory(), + pb_cancel_); #endif } @@ -390,7 +389,8 @@ InferRequest::InferRequest( std::make_shared(response_factory_address_, request_address_); response_sender_ = std::make_shared( request_address_, response_factory_address_, is_model_decoupled, - Stub::GetOrCreateInstance()->SharedMemory(), pb_cancel_); + RequestedOutputNames(), Stub::GetOrCreateInstance()->SharedMemory(), + pb_cancel_); #endif } diff --git a/src/response_sender.cc b/src/response_sender.cc index 74914ab4..1831601f 100644 --- a/src/response_sender.cc +++ b/src/response_sender.cc @@ -54,12 +54,15 @@ CheckResponseSenderArguments( ResponseSender::ResponseSender( intptr_t request_address, intptr_t response_factory_address, - bool const* is_decoupled, std::unique_ptr& shm_pool, + bool const* is_decoupled, + const std::set& requested_output_names, + std::unique_ptr& shm_pool, const std::shared_ptr& pb_cancel) : request_address_(request_address), response_factory_address_(response_factory_address), - is_decoupled_(is_decoupled), shm_pool_(shm_pool), pb_cancel_(pb_cancel), - closed_(false), number_of_response_sent_(0) + is_decoupled_(is_decoupled), + requested_output_names_(requested_output_names), shm_pool_(shm_pool), + pb_cancel_(pb_cancel), closed_(false), number_of_response_sent_(0) { } @@ -123,6 +126,9 @@ ResponseSender::Send( CheckResponseSenderArguments(infer_response, flags); UpdateStateAndCounters(infer_response, flags); + if (infer_response) { + infer_response->PruneOutputTensors(requested_output_names_); + } std::unique_ptr& stub = Stub::GetOrCreateInstance(); diff --git a/src/response_sender.h b/src/response_sender.h index 1b57508e..f274f5b4 100644 --- a/src/response_sender.h +++ b/src/response_sender.h @@ -38,7 +38,9 @@ class ResponseSender { public: ResponseSender( intptr_t request_address, intptr_t response_factory_address, - bool const* is_decoupled, std::unique_ptr& shm_pool, + bool const* is_decoupled, + const std::set& requested_output_names, + std::unique_ptr& shm_pool, const std::shared_ptr& pb_cancel); ~ResponseSender(); void Send(std::shared_ptr response, const uint32_t flags); @@ -54,6 +56,7 @@ class ResponseSender { intptr_t request_address_; intptr_t response_factory_address_; bool const* is_decoupled_; + std::set requested_output_names_; std::unique_ptr& shm_pool_; std::shared_ptr pb_cancel_; From c8b188f26a4e80c7204baaf73e27f11c33f52f57 Mon Sep 17 00:00:00 2001 From: Ryan McCormick Date: Fri, 14 Jun 2024 22:53:49 -0700 Subject: [PATCH 28/54] Add windows typedef for ssize_t (#368) --- src/pb_tensor.cc | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/pb_tensor.cc b/src/pb_tensor.cc index d9d47784..0915c1d9 100644 --- a/src/pb_tensor.cc +++ b/src/pb_tensor.cc @@ -35,6 +35,11 @@ namespace py = pybind11; #endif #include "pb_tensor.h" +// WAR for undefined ssize_t on Windows: https://stackoverflow.com/a/35368387 +#if 
defined(_MSC_VER) +#include +typedef SSIZE_T ssize_t; +#endif namespace triton { namespace backend { namespace python { From 2b12abeba3e612633483093dcfc09a771bcedfaa Mon Sep 17 00:00:00 2001 From: Ryan McCormick Date: Tue, 30 Jul 2024 14:14:06 -0700 Subject: [PATCH 29/54] feat: Add BF16 tensor support via dlpack (#371) --- README.md | 4 ++++ src/pb_stub_utils.cc | 17 ++++++++++++++++- src/pb_tensor.cc | 24 ++++++++++++++++++------ 3 files changed, 38 insertions(+), 7 deletions(-) diff --git a/README.md b/README.md index 30f2dd25..eee6af39 100644 --- a/README.md +++ b/README.md @@ -1557,6 +1557,10 @@ input0 = pb_utils.Tensor.from_dlpack("INPUT0", pytorch_tensor) This method only supports contiguous Tensors that are in C-order. If the tensor is not C-order contiguous an exception will be raised. +For python models with input or output tensors of type BFloat16 (BF16), the +`as_numpy()` method is not supported, and the `from_dlpack` and `to_dlpack` +methods must be used instead. + ## `pb_utils.Tensor.is_cpu() -> bool` This function can be used to check whether a tensor is placed in CPU or not. diff --git a/src/pb_stub_utils.cc b/src/pb_stub_utils.cc index c9ffd661..9e05feae 100644 --- a/src/pb_stub_utils.cc +++ b/src/pb_stub_utils.cc @@ -1,4 +1,4 @@ -// Copyright 2021-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// Copyright 2021-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions @@ -168,6 +168,8 @@ triton_to_pybind_dtype(TRITONSERVER_DataType data_type) dtype_numpy = py::dtype(py::format_descriptor::format()); break; case TRITONSERVER_TYPE_BF16: + // NOTE: Currently skipping this call via `if (BF16)` check, but may + // want to better handle this or set some default/invalid dtype. throw PythonBackendException("TYPE_BF16 not currently supported."); case TRITONSERVER_TYPE_INVALID: throw PythonBackendException("Dtype is invalid."); @@ -240,6 +242,10 @@ triton_to_dlpack_type(TRITONSERVER_DataType triton_dtype) case TRITONSERVER_TYPE_BYTES: throw PythonBackendException( "TYPE_BYTES tensors cannot be converted to DLPack."); + case TRITONSERVER_TYPE_BF16: + dl_code = DLDataTypeCode::kDLBfloat; + dt_size = 16; + break; default: throw PythonBackendException( @@ -301,6 +307,15 @@ dlpack_to_triton_type(const DLDataType& data_type) } } + if (data_type.code == DLDataTypeCode::kDLBfloat) { + if (data_type.bits != 16) { + throw PythonBackendException( + "Expected BF16 tensor to have 16 bits, but had: " + + std::to_string(data_type.bits)); + } + return TRITONSERVER_TYPE_BF16; + } + return TRITONSERVER_TYPE_INVALID; } }}} // namespace triton::backend::python diff --git a/src/pb_tensor.cc b/src/pb_tensor.cc index 0915c1d9..1ab95144 100644 --- a/src/pb_tensor.cc +++ b/src/pb_tensor.cc @@ -1,4 +1,4 @@ -// Copyright 2021-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// Copyright 2021-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions @@ -152,7 +152,10 @@ PbTensor::PbTensor( #ifdef TRITON_PB_STUB if (memory_type_ == TRITONSERVER_MEMORY_CPU || memory_type_ == TRITONSERVER_MEMORY_CPU_PINNED) { - if (dtype != TRITONSERVER_TYPE_BYTES) { + if (dtype == TRITONSERVER_TYPE_BF16) { + // No native numpy representation for BF16. DLPack should be used instead. 
+ numpy_array_ = py::none(); + } else if (dtype != TRITONSERVER_TYPE_BYTES) { py::object numpy_array = py::array(triton_to_pybind_dtype(dtype_), dims_, (void*)memory_ptr_); numpy_array_ = numpy_array.attr("view")(triton_to_numpy_type(dtype_)); @@ -512,12 +515,18 @@ PbTensor::Name() const const py::array* PbTensor::AsNumpy() const { - if (IsCPU()) { - return &numpy_array_; - } else { + if (!IsCPU()) { throw PythonBackendException( "Tensor is stored in GPU and cannot be converted to NumPy."); } + + if (dtype_ == TRITONSERVER_TYPE_BF16) { + throw PythonBackendException( + "Tensor dtype is BF16 and cannot be converted to NumPy. Use " + "to_dlpack() and from_dlpack() instead."); + } + + return &numpy_array_; } #endif // TRITON_PB_STUB @@ -643,7 +652,10 @@ PbTensor::PbTensor( #ifdef TRITON_PB_STUB if (memory_type_ == TRITONSERVER_MEMORY_CPU || memory_type_ == TRITONSERVER_MEMORY_CPU_PINNED) { - if (dtype_ != TRITONSERVER_TYPE_BYTES) { + if (dtype_ == TRITONSERVER_TYPE_BF16) { + // No native numpy representation for BF16. DLPack should be used instead. + numpy_array_ = py::none(); + } else if (dtype_ != TRITONSERVER_TYPE_BYTES) { py::object numpy_array = py::array(triton_to_pybind_dtype(dtype_), dims_, (void*)memory_ptr_); numpy_array_ = numpy_array.attr("view")(triton_to_numpy_type(dtype_)); From 4d469a904f34440d2ba90f775088ad4637b46c0c Mon Sep 17 00:00:00 2001 From: Yingge He <157551214+yinggeh@users.noreply.github.com> Date: Wed, 31 Jul 2024 09:02:29 -0700 Subject: [PATCH 30/54] refactor: Refactor string input checks (#370) Refactor string input tensor checks --- src/python_be.cc | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/src/python_be.cc b/src/python_be.cc index cd31e79e..2212176d 100644 --- a/src/python_be.cc +++ b/src/python_be.cc @@ -424,6 +424,15 @@ ModelInstanceState::GetInputTensor( RETURN_IF_ERROR(backend::ReadInputTensor( request, input_name, input_buffer, &byte_size)); } + + if (input_dtype == TRITONSERVER_TYPE_BYTES) { + const char* content = reinterpret_cast(input_tensor->DataPtr()); + size_t content_byte_size = input_tensor->ByteSize(); + const size_t request_element_cnt = GetElementCount(input_tensor->Dims()); + RETURN_IF_ERROR(ValidateStringBuffer( + content, content_byte_size, request_element_cnt, input_name, + nullptr /* str_list */)); + } } else { #ifdef TRITON_ENABLE_GPU // Attempt to use the cuda shared memory pool for GPU tensor. 
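The BF16 support added above (#371) is DLPack-only on the Python side: `as_numpy()` raises for BF16 tensors, so models are expected to move data through `to_dlpack()`/`from_dlpack()`. A minimal `model.py` sketch of that pattern, assuming PyTorch is installed in the stub environment and using hypothetical tensor names `INPUT0`/`OUTPUT0` declared as `TYPE_BF16` in the model config:

```python
# Hedged sketch only: tensor names and the doubling computation are made up.
import torch
import triton_python_backend_utils as pb_utils


class TritonPythonModel:
    def execute(self, requests):
        responses = []
        for request in requests:
            in0 = pb_utils.get_input_tensor_by_name(request, "INPUT0")
            # as_numpy() is unsupported for BF16; convert through DLPack.
            torch_in0 = torch.utils.dlpack.from_dlpack(in0.to_dlpack())
            # Keep the result C-contiguous, as required for DLPack transfer.
            torch_out0 = (torch_in0 * 2).contiguous()
            out0 = pb_utils.Tensor.from_dlpack("OUTPUT0", torch_out0)
            responses.append(pb_utils.InferenceResponse(output_tensors=[out0]))
        return responses
```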
From 2203a5bcb729b56fc56ef1b3b77e527e0e9faa93 Mon Sep 17 00:00:00 2001 From: Jacky <18255193+kthui@users.noreply.github.com> Date: Tue, 6 Aug 2024 16:40:00 -0700 Subject: [PATCH 31/54] Delete response factory after sending complete final (#373) --- src/response_sender.cc | 24 +++++++++++++++++++----- src/response_sender.h | 4 ++++ 2 files changed, 23 insertions(+), 5 deletions(-) diff --git a/src/response_sender.cc b/src/response_sender.cc index 1831601f..0a88fb6b 100644 --- a/src/response_sender.cc +++ b/src/response_sender.cc @@ -62,16 +62,14 @@ ResponseSender::ResponseSender( response_factory_address_(response_factory_address), is_decoupled_(is_decoupled), requested_output_names_(requested_output_names), shm_pool_(shm_pool), - pb_cancel_(pb_cancel), closed_(false), number_of_response_sent_(0) + pb_cancel_(pb_cancel), closed_(false), number_of_response_sent_(0), + response_factory_deleted_(false) { } ResponseSender::~ResponseSender() { - std::unique_ptr& stub = Stub::GetOrCreateInstance(); - stub->EnqueueCleanupId( - reinterpret_cast(response_factory_address_), - PYTHONSTUB_DecoupledResponseFactoryCleanup); + DeleteResponseFactory(); } void @@ -248,6 +246,10 @@ ResponseSender::Send( "An error occurred while sending a response."); } } + + if (flags == TRITONSERVER_RESPONSE_COMPLETE_FINAL) { + DeleteResponseFactory(); + } } bool @@ -263,4 +265,16 @@ ResponseSender::Close() closed_ = true; } +void +ResponseSender::DeleteResponseFactory() +{ + bool already_deleted = response_factory_deleted_.exchange(true); + if (!already_deleted) { + std::unique_ptr& stub = Stub::GetOrCreateInstance(); + stub->EnqueueCleanupId( + reinterpret_cast(response_factory_address_), + PYTHONSTUB_DecoupledResponseFactoryCleanup); + } +} + }}} // namespace triton::backend::python diff --git a/src/response_sender.h b/src/response_sender.h index f274f5b4..69f416c2 100644 --- a/src/response_sender.h +++ b/src/response_sender.h @@ -26,6 +26,7 @@ #pragma once +#include #include #include "infer_response.h" @@ -52,6 +53,7 @@ class ResponseSender { private: void UpdateStateAndCounters( const std::shared_ptr& response, const uint32_t flags); + void DeleteResponseFactory(); intptr_t request_address_; intptr_t response_factory_address_; @@ -63,5 +65,7 @@ class ResponseSender { std::mutex mu_; bool closed_; size_t number_of_response_sent_; + + std::atomic response_factory_deleted_; }; }}} // namespace triton::backend::python From 1393d6e1866c28a051253a08cf7c928bcbd1cad3 Mon Sep 17 00:00:00 2001 From: Jacky <18255193+kthui@users.noreply.github.com> Date: Wed, 7 Aug 2024 10:57:10 -0700 Subject: [PATCH 32/54] Release GIL during cancellation check (#372) --- src/pb_cancel.cc | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/pb_cancel.cc b/src/pb_cancel.cc index 4c9b926b..0774261d 100644 --- a/src/pb_cancel.cc +++ b/src/pb_cancel.cc @@ -1,4 +1,4 @@ -// Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// Copyright 2023-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions @@ -57,6 +57,9 @@ PbCancel::ShmPayload() bool PbCancel::IsCancelled() { + // Release the GIL. Python objects are not accessed during the check. + py::gil_scoped_release gil_release; + std::unique_lock lk(mu_); // The cancelled flag can only move from false to true, not the other way, so // it is checked on each query until cancelled and then implicitly cached. 
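Patches 31 and 32 above both touch the decoupled response path: the response factory is now released when the final flag is sent, and the cancellation check no longer holds the GIL. From the model's point of view the contract is unchanged; a hedged sketch of a decoupled `model.py` that exercises both code paths (assuming `model_transaction_policy { decoupled: true }` in the config and made-up tensor names) might look like:

```python
# Hedged sketch only; the streaming loop and tensor names are illustrative.
import numpy as np
import triton_python_backend_utils as pb_utils


class TritonPythonModel:
    def execute(self, requests):
        for request in requests:
            sender = request.get_response_sender()
            for i in range(4):
                # Exercises the cancellation check; with patch 32 this no
                # longer blocks other Python threads while it waits.
                if sender.is_cancelled():
                    break
                out = pb_utils.Tensor("OUT", np.array([i], dtype=np.int32))
                sender.send(pb_utils.InferenceResponse(output_tensors=[out]))
            # The final flag must be sent exactly once; with patch 31 this is
            # also the point where the response factory is cleaned up.
            sender.send(flags=pb_utils.TRITONSERVER_RESPONSE_COMPLETE_FINAL)
        return None
```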
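The next patch (#374) extends the metrics bindings with a `HISTOGRAM` kind, a `buckets` argument on `MetricFamily.Metric()`, and an `observe()` call. A hedged usage sketch built only from those bindings (metric name, labels, and bucket boundaries are made up):

```python
# Hedged sketch only: the metric family, labels, and buckets are illustrative.
import time
import triton_python_backend_utils as pb_utils


class TritonPythonModel:
    def initialize(self, args):
        self.latency_family = pb_utils.MetricFamily(
            name="requests_process_latency_ns",
            description="Histogram of request processing latency",
            kind=pb_utils.MetricFamily.HISTOGRAM,
        )
        # Buckets must be a monotonically increasing list of boundaries and
        # are required for (and only allowed with) HISTOGRAM metrics.
        self.latency_metric = self.latency_family.Metric(
            labels={"model": "example", "version": "1"},
            buckets=[1e6, 1e7, 1e8, 1e9],
        )

    def execute(self, requests):
        start_ns = time.time_ns()
        responses = [
            pb_utils.InferenceResponse(output_tensors=[]) for _ in requests
        ]
        self.latency_metric.observe(time.time_ns() - start_ns)
        return responses
```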
From 958c8c9c54ddf3350520e53a71c9b2369f67011f Mon Sep 17 00:00:00 2001 From: Yingge He <157551214+yinggeh@users.noreply.github.com> Date: Fri, 16 Aug 2024 14:23:04 -0700 Subject: [PATCH 33/54] feat: Add new histogram metric type (#374) --- README.md | 6 +-- src/ipc_message.h | 3 +- src/metric.cc | 106 +++++++++++++++++++++++++++++++++++++++---- src/metric.h | 26 +++++++++-- src/metric_family.cc | 32 +++++++++++-- src/metric_family.h | 11 +++-- src/pb_stub.cc | 6 ++- src/pb_utils.h | 4 +- src/python_be.cc | 4 +- 9 files changed, 169 insertions(+), 29 deletions(-) diff --git a/README.md b/README.md index eee6af39..913034a8 100644 --- a/README.md +++ b/README.md @@ -1656,12 +1656,12 @@ import triton_python_backend_utils as pb_utils class TritonPythonModel: def initialize(self, args): # Create a MetricFamily object to report the latency of the model - # execution. The 'kind' parameter must be either 'COUNTER' or - # 'GAUGE'. + # execution. The 'kind' parameter must be either 'COUNTER', + # 'GAUGE' or 'HISTOGRAM'. self.metric_family = pb_utils.MetricFamily( name="preprocess_latency_ns", description="Cumulative time spent pre-processing requests", - kind=pb_utils.MetricFamily.COUNTER # or pb_utils.MetricFamily.GAUGE + kind=pb_utils.MetricFamily.COUNTER ) # Create a Metric object under the MetricFamily object. The 'labels' diff --git a/src/ipc_message.h b/src/ipc_message.h index ac28238c..8e762b8f 100644 --- a/src/ipc_message.h +++ b/src/ipc_message.h @@ -1,4 +1,4 @@ -// Copyright 2021-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// Copyright 2021-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions @@ -63,6 +63,7 @@ typedef enum PYTHONSTUB_commandtype_enum { PYTHONSTUB_MetricRequestValue, PYTHONSTUB_MetricRequestIncrement, PYTHONSTUB_MetricRequestSet, + PYTHONSTUB_MetricRequestObserve, PYTHONSTUB_LoadModelRequest, PYTHONSTUB_UnloadModelRequest, PYTHONSTUB_ModelReadinessRequest, diff --git a/src/metric.cc b/src/metric.cc index f67c55bf..7796b161 100644 --- a/src/metric.cc +++ b/src/metric.cc @@ -1,4 +1,4 @@ -// Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// Copyright 2023-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
// // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions @@ -32,9 +32,12 @@ namespace triton { namespace backend { namespace python { -Metric::Metric(const std::string& labels, void* metric_family_address) - : labels_(labels), operation_value_(0), metric_address_(nullptr), - metric_family_address_(metric_family_address), is_cleared_(false) +Metric::Metric( + const std::string& labels, std::optional> buckets, + void* metric_family_address) + : labels_(labels), buckets_(buckets), operation_value_(0), + metric_address_(nullptr), metric_family_address_(metric_family_address), + is_cleared_(false) { #ifdef TRITON_PB_STUB SendCreateMetricRequest(); @@ -62,6 +65,20 @@ Metric::SaveToSharedMemory(std::unique_ptr& shm_pool) custom_metric_shm_ptr_->metric_family_address = metric_family_address_; custom_metric_shm_ptr_->metric_address = metric_address_; + // Histogram specific case + if (buckets_.has_value()) { + auto buckets_size = buckets_.value().size() * sizeof(double); + std::unique_ptr buckets_shm = PbMemory::Create( + shm_pool, TRITONSERVER_MemoryType::TRITONSERVER_MEMORY_CPU, 0, + buckets_size, reinterpret_cast(buckets_.value().data()), + false /* copy_gpu */); + custom_metric_shm_ptr_->buckets_shm_handle = buckets_shm->ShmHandle(); + buckets_shm_ = std::move(buckets_shm); + } else { + custom_metric_shm_ptr_->buckets_shm_handle = 0; + buckets_shm_ = nullptr; + } + // Save the references to shared memory. custom_metric_shm_ = std::move(custom_metric_shm); labels_shm_ = std::move(labels_shm); @@ -80,17 +97,40 @@ Metric::LoadFromSharedMemory( std::unique_ptr labels_shm = PbString::LoadFromSharedMemory( shm_pool, custom_metric_shm_ptr->labels_shm_handle); - return std::unique_ptr(new Metric(custom_metric_shm, labels_shm)); + std::unique_ptr buckets_shm = nullptr; + if (custom_metric_shm_ptr->buckets_shm_handle != 0) { + buckets_shm = PbMemory::LoadFromSharedMemory( + shm_pool, custom_metric_shm_ptr->buckets_shm_handle, + false /* open_cuda_handle */); + } + + return std::unique_ptr( + new Metric(custom_metric_shm, labels_shm, buckets_shm)); } Metric::Metric( AllocatedSharedMemory& custom_metric_shm, - std::unique_ptr& labels_shm) + std::unique_ptr& labels_shm, + std::unique_ptr& buckets_shm) : custom_metric_shm_(std::move(custom_metric_shm)), - labels_shm_(std::move(labels_shm)) + labels_shm_(std::move(labels_shm)), buckets_shm_(std::move(buckets_shm)) { custom_metric_shm_ptr_ = custom_metric_shm_.data_.get(); + + // FIXME: This constructor is called during each + // set/increment/observe/get_value call. It only needs the pointers. 
labels_ = labels_shm_->String(); + if (buckets_shm_ != nullptr) { // Histogram + size_t bucket_size = buckets_shm_->ByteSize() / sizeof(double); + std::vector buckets; + buckets.reserve(bucket_size); + for (size_t i = 0; i < bucket_size; ++i) { + buckets.emplace_back( + reinterpret_cast(buckets_shm_->DataPtr())[i]); + } + buckets_ = std::move(buckets); + } + operation_value_ = custom_metric_shm_ptr_->operation_value; metric_family_address_ = custom_metric_shm_ptr_->metric_family_address; metric_address_ = custom_metric_shm_ptr_->metric_address; @@ -161,6 +201,24 @@ Metric::SendSetValueRequest(const double& value) } } +void +Metric::SendObserveRequest(const double& value) +{ + try { + CheckIfCleared(); + std::unique_ptr& stub = Stub::GetOrCreateInstance(); + operation_value_ = value; + SaveToSharedMemory(stub->ShmPool()); + AllocatedSharedMemory custom_metrics_shm; + stub->SendMessage( + custom_metrics_shm, PYTHONSTUB_MetricRequestObserve, shm_handle_); + } + catch (const PythonBackendException& pb_exception) { + throw PythonBackendException( + "Failed to observe metric value: " + std::string(pb_exception.what())); + } +} + double Metric::SendGetValueRequest() { @@ -222,14 +280,35 @@ Metric::InitializeTritonMetric() { std::vector labels_params; ParseLabels(labels_params, labels_); + TRITONSERVER_MetricKind kind; + THROW_IF_TRITON_ERROR(TRITONSERVER_GetMetricFamilyKind( + reinterpret_cast(metric_family_address_), + &kind)); + TRITONSERVER_MetricArgs* args = nullptr; + switch (kind) { + case TRITONSERVER_METRIC_KIND_COUNTER: + case TRITONSERVER_METRIC_KIND_GAUGE: + break; + case TRITONSERVER_METRIC_KIND_HISTOGRAM: { + const std::vector& buckets = buckets_.value(); + THROW_IF_TRITON_ERROR(TRITONSERVER_MetricArgsNew(&args)); + THROW_IF_TRITON_ERROR(TRITONSERVER_MetricArgsSetHistogram( + args, buckets.data(), buckets.size())); + break; + } + default: + break; + } + TRITONSERVER_Metric* triton_metric = nullptr; - THROW_IF_TRITON_ERROR(TRITONSERVER_MetricNew( + THROW_IF_TRITON_ERROR(TRITONSERVER_MetricNewWithArgs( &triton_metric, reinterpret_cast(metric_family_address_), - labels_params.data(), labels_params.size())); + labels_params.data(), labels_params.size(), args)); for (const auto label : labels_params) { TRITONSERVER_ParameterDelete(const_cast(label)); } + THROW_IF_TRITON_ERROR(TRITONSERVER_MetricArgsDelete(args)); return reinterpret_cast(triton_metric); } @@ -262,6 +341,8 @@ Metric::HandleMetricOperation( Increment(operation_value_); } else if (command_type == PYTHONSTUB_MetricRequestSet) { SetValue(operation_value_); + } else if (command_type == PYTHONSTUB_MetricRequestObserve) { + Observe(operation_value_); } else { throw PythonBackendException("Unknown metric operation"); } @@ -281,6 +362,13 @@ Metric::SetValue(const double& value) THROW_IF_TRITON_ERROR(TRITONSERVER_MetricSet(triton_metric, value)); } +void +Metric::Observe(const double& value) +{ + auto triton_metric = reinterpret_cast(metric_address_); + THROW_IF_TRITON_ERROR(TRITONSERVER_MetricObserve(triton_metric, value)); +} + double Metric::GetValue() { diff --git a/src/metric.h b/src/metric.h index 197e8ce9..cd54ca54 100644 --- a/src/metric.h +++ b/src/metric.h @@ -1,4 +1,4 @@ -// Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// Copyright 2023-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
// // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions @@ -26,9 +26,11 @@ #pragma once +#include #include #include "ipc_message.h" +#include "pb_memory.h" #include "pb_string.h" #include "pb_utils.h" @@ -47,6 +49,8 @@ namespace triton { namespace backend { namespace python { struct MetricShm { // The shared memory handle of the labels in PbString format. bi::managed_external_buffer::handle_t labels_shm_handle; + // The shared memory handle of the buckets in PbMemory format. + bi::managed_external_buffer::handle_t buckets_shm_handle; // The value used for incrementing or setting the metric. double operation_value; // The address of the TRITONSERVER_Metric object. @@ -58,7 +62,10 @@ struct MetricShm { class Metric { public: - Metric(const std::string& labels, void* metric_family_address); + Metric( + const std::string& labels, + std::optional> buckets, + void* metric_family_address); ~Metric(); @@ -97,6 +104,10 @@ class Metric { /// \param value The value to set the metric to. void SendSetValueRequest(const double& value); + /// Send the request to the parent process to observe the value to the metric. + /// \param value The value to set the metric to. + void SendObserveRequest(const double& value); + /// Send the request to the parent process to get the value of the metric. /// \return Returns the value of the metric. double SendGetValueRequest(); @@ -132,6 +143,10 @@ class Metric { /// \param value The value to set the metric to. void SetValue(const double& value); + /// Use Triton C API to sample the observation to the metric. + /// \param value The value to sample observation to the metric. + void Observe(const double& value); + /// Use Triton C API to get the value of the metric. double GetValue(); @@ -146,10 +161,14 @@ class Metric { // The private constructor for creating a Metric object from shared memory. Metric( AllocatedSharedMemory& custom_metric_shm, - std::unique_ptr& labels_shm); + std::unique_ptr& labels_shm, + std::unique_ptr& buckets); // The labels of the metric, which is the identifier of the metric. std::string labels_; + // Monotonically increasing values representing bucket boundaries for creating + // histogram metric. + std::optional> buckets_; // The value used for incrementing or setting the metric. double operation_value_; // The address of the TRITONSERVER_Metric object. @@ -168,6 +187,7 @@ class Metric { MetricShm* custom_metric_shm_ptr_; bi::managed_external_buffer::handle_t shm_handle_; std::unique_ptr labels_shm_; + std::unique_ptr buckets_shm_; }; }}}; // namespace triton::backend::python diff --git a/src/metric_family.cc b/src/metric_family.cc index 77e8aedf..222a0e23 100644 --- a/src/metric_family.cc +++ b/src/metric_family.cc @@ -1,4 +1,4 @@ -// Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// Copyright 2023-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions @@ -166,19 +166,39 @@ MetricFamily::SendCreateMetricFamilyRequest() } std::shared_ptr -MetricFamily::CreateMetric(const py::object& labels) +MetricFamily::CreateMetric(const py::object& labels, const py::object& buckets) { if (!labels.is_none()) { if (!py::isinstance(labels)) { throw PythonBackendException( - "Failed to create metric. Labels must be a " - "dictionary."); + "Failed to create metric. 
Labels must be a dictionary."); } } py::module json = py::module_::import("json"); std::string labels_str = std::string(py::str(json.attr("dumps")(labels))); - auto metric = std::make_shared(labels_str, metric_family_address_); + + std::optional> buckets_vec; + if (!buckets.is_none()) { + if (!py::isinstance(buckets)) { + throw PythonBackendException( + "Failed to create metric. Buckets must be a list."); + } + if (kind_ == kCounter || kind_ == kGauge) { + throw PythonBackendException( + "Failed to create metric. Unexpected buckets found."); + } + buckets_vec = buckets.cast>(); + } else { + if (kind_ == kHistogram) { + throw PythonBackendException( + "Failed to create metric. Missing required buckets."); + } + buckets_vec = std::nullopt; + } + + auto metric = + std::make_shared(labels_str, buckets_vec, metric_family_address_); { std::lock_guard lock(metric_map_mu_); metric_map_.insert({metric->MetricAddress(), metric}); @@ -205,6 +225,8 @@ MetricFamily::ToTritonServerMetricKind(const MetricKind& kind) return TRITONSERVER_METRIC_KIND_COUNTER; case kGauge: return TRITONSERVER_METRIC_KIND_GAUGE; + case kHistogram: + return TRITONSERVER_METRIC_KIND_HISTOGRAM; default: throw PythonBackendException("Unknown metric kind"); } diff --git a/src/metric_family.h b/src/metric_family.h index 04374a68..2b5f86ab 100644 --- a/src/metric_family.h +++ b/src/metric_family.h @@ -1,4 +1,4 @@ -// Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// Copyright 2023-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions @@ -97,8 +97,11 @@ class MetricFamily { /// Create a metric from the metric family and store it in the metric map. /// \param labels The labels of the metric. + /// \param buckets Monotonically increasing values representing bucket + /// boundaries for creating histogram metric. /// \return Returns the shared pointer to the created metric. - std::shared_ptr CreateMetric(const py::object& labels); + std::shared_ptr CreateMetric( + const py::object& labels, const py::object& buckets); #else /// Initialize the TRITONSERVER_MetricFamily object. /// \return Returns the address of the TRITONSERVER_MetricFamily object. @@ -128,8 +131,8 @@ class MetricFamily { std::string name_; // The description of the metric family. std::string description_; - // The metric kind of the metric family. Currently only supports GAUGE and - // COUNTER. + // The metric kind of the metric family. Currently only supports GAUGE, + // COUNTER and HISTOGRAM. MetricKind kind_; // The address of the TRITONSERVER_MetricFamily object. 
void* metric_family_address_; diff --git a/src/pb_stub.cc b/src/pb_stub.cc index 2a6be556..007e7f29 100644 --- a/src/pb_stub.cc +++ b/src/pb_stub.cc @@ -1824,11 +1824,13 @@ PYBIND11_EMBEDDED_MODULE(c_python_backend_utils, module) py::class_>(module, "Metric") .def("increment", &Metric::SendIncrementRequest) .def("set", &Metric::SendSetValueRequest) + .def("observe", &Metric::SendObserveRequest) .def("value", &Metric::SendGetValueRequest); py::enum_(module, "MetricKind") .value("COUNTER", MetricKind::kCounter) .value("GAUGE", MetricKind::kGauge) + .value("HISTOGRAM", MetricKind::kHistogram) .export_values(); py::class_>( @@ -1839,9 +1841,11 @@ PYBIND11_EMBEDDED_MODULE(c_python_backend_utils, module) py::arg("kind").none(false)) .def( "Metric", &MetricFamily::CreateMetric, - py::arg("labels").none(true) = py::none()); + py::arg("labels").none(true) = py::none(), + py::arg("buckets").none(true) = py::none()); module.attr("MetricFamily").attr("COUNTER") = MetricKind::kCounter; module.attr("MetricFamily").attr("GAUGE") = MetricKind::kGauge; + module.attr("MetricFamily").attr("HISTOGRAM") = MetricKind::kHistogram; module.def( "load_model", &LoadModel, py::arg("model_name").none(false), diff --git a/src/pb_utils.h b/src/pb_utils.h index 1a6c2d8b..e68cfb0f 100644 --- a/src/pb_utils.h +++ b/src/pb_utils.h @@ -1,4 +1,4 @@ -// Copyright 2021-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// Copyright 2021-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions @@ -171,7 +171,7 @@ struct ResponseBatch : SendMessageBase { enum LogLevel { kInfo = 0, kWarning, kError, kVerbose }; -enum MetricKind { kCounter = 0, kGauge }; +enum MetricKind { kCounter = 0, kGauge, kHistogram }; struct LogSendMessage : SendMessageBase { bi::managed_external_buffer::handle_t filename; diff --git a/src/python_be.cc b/src/python_be.cc index 2212176d..761abdbf 100644 --- a/src/python_be.cc +++ b/src/python_be.cc @@ -758,7 +758,8 @@ ModelInstanceState::StubToParentMQMonitor() case PYTHONSTUB_MetricRequestDelete: case PYTHONSTUB_MetricRequestValue: case PYTHONSTUB_MetricRequestIncrement: - case PYTHONSTUB_MetricRequestSet: { + case PYTHONSTUB_MetricRequestSet: + case PYTHONSTUB_MetricRequestObserve: { ProcessMetricRequest(message); break; } @@ -978,6 +979,7 @@ ModelInstanceState::ProcessMetricRequest( } case PYTHONSTUB_MetricRequestIncrement: case PYTHONSTUB_MetricRequestSet: + case PYTHONSTUB_MetricRequestObserve: case PYTHONSTUB_MetricRequestValue: { metric->HandleMetricOperation(metrics_message_ptr, command); break; From d84bb57b06c985162fe537b46593281589bfb4d5 Mon Sep 17 00:00:00 2001 From: Iman Tabrizian Date: Wed, 28 Aug 2024 13:50:43 -0400 Subject: [PATCH 34/54] Improve the documentation for custom Python backend stubs (#377) * Improve the documentation for custom Python backend stubs * Review comment --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 913034a8..ffcef26c 100644 --- a/README.md +++ b/README.md @@ -809,8 +809,8 @@ Python version is different from Python 3.10 which is shipped by default in the Triton containers.** Python backend uses a *stub* process to connect your `model.py` file to the -Triton C++ core. This stub process has an embedded Python interpreter with -a fixed Python version. If you intend to use a Python interpreter with +Triton C++ core. 
This stub process dynamically links to a specific +`libpython..so` version. If you intend to use a Python interpreter with different version from the default Python backend stub, you need to compile your own Python backend stub by following the steps below: From 35a1c1fad5104c9c4149dd7fee69585d99bb6009 Mon Sep 17 00:00:00 2001 From: Yingge He <157551214+yinggeh@users.noreply.github.com> Date: Sat, 21 Sep 2024 13:16:39 -0700 Subject: [PATCH 35/54] perf: vLLM metrics optimization (#379) --- src/metric.cc | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/metric.cc b/src/metric.cc index 7796b161..4c055910 100644 --- a/src/metric.cc +++ b/src/metric.cc @@ -167,6 +167,7 @@ Metric::SendCreateMetricRequest() void Metric::SendIncrementRequest(const double& value) { + py::gil_scoped_release release; try { CheckIfCleared(); std::unique_ptr& stub = Stub::GetOrCreateInstance(); @@ -204,6 +205,7 @@ Metric::SendSetValueRequest(const double& value) void Metric::SendObserveRequest(const double& value) { + py::gil_scoped_release release; try { CheckIfCleared(); std::unique_ptr& stub = Stub::GetOrCreateInstance(); From a2564eae71bd7b090c3694921036cac67bd018f2 Mon Sep 17 00:00:00 2001 From: Kris Hung Date: Tue, 8 Oct 2024 12:59:28 -0700 Subject: [PATCH 36/54] Add back 24.05 response sending path to fix performance (#381) * Add back 24.05 response sender path * Improve perf * Fix cleanup * Review comments * Fix up * Fix up * Fix response factory cleanup * Fix segfault * Fix error handling * Remove extra logs * Fix up, add comments * Address comment * Fix up --------- Co-authored-by: Iman Tabrizian --- src/infer_request.cc | 2 +- src/infer_request.h | 1 + src/ipc_message.cc | 23 +++ src/ipc_message.h | 9 + src/pb_stub.cc | 146 +++++++++++--- src/pb_stub.h | 5 +- src/pb_utils.h | 3 + src/python_be.cc | 438 +++++++++++++++++++++++++++++++++++------ src/python_be.h | 21 +- src/response_sender.cc | 23 ++- src/response_sender.h | 5 +- 11 files changed, 579 insertions(+), 97 deletions(-) diff --git a/src/infer_request.cc b/src/infer_request.cc index 8a95b524..e5733662 100644 --- a/src/infer_request.cc +++ b/src/infer_request.cc @@ -484,7 +484,7 @@ InferRequest::Exec(const bool is_decoupled) { bi::scoped_lock lock{ *(ipc_message->ResponseMutex())}; - stub->SendIPCMessage(ipc_message); + stub->SendIPCUtilsMessage(ipc_message); ipc_message->ResponseCondition()->wait(lock); } diff --git a/src/infer_request.h b/src/infer_request.h index c67e2fb0..f368d692 100644 --- a/src/infer_request.h +++ b/src/infer_request.h @@ -96,6 +96,7 @@ class InferRequest { InferenceTrace& GetTrace(); uint32_t ReleaseFlags(); void SetReleaseFlags(const uint32_t& flags); + intptr_t GetResponseFactoryAddress() { return response_factory_address_; } #ifdef TRITON_PB_STUB std::shared_ptr Exec(const bool is_decoupled); diff --git a/src/ipc_message.cc b/src/ipc_message.cc index ea1dc5b0..2fa13ba3 100644 --- a/src/ipc_message.cc +++ b/src/ipc_message.cc @@ -56,6 +56,21 @@ IPCMessage::Create( new IPCMessage(ipc_message_shm, response_mutex_shm, response_cond_shm)); } +std::unique_ptr +IPCMessage::Create( + IPCMessageShm* ipc_message_shm, + bi::managed_external_buffer::handle_t& message_handle) +{ + return std::unique_ptr( + new IPCMessage(ipc_message_shm, message_handle)); +} + +AllocatedSharedMemory& +IPCMessage::GetAllocatedSharedMemory() +{ + return ipc_message_shm_; +} + std::unique_ptr IPCMessage::LoadFromSharedMemory( std::unique_ptr& shm_pool, @@ -133,4 +148,12 @@ IPCMessage::IPCMessage( ipc_message_handle_ = ipc_message_shm_.handle_; } 
+IPCMessage::IPCMessage( + IPCMessageShm* ipc_message_shm, + bi::managed_external_buffer::handle_t& handle) +{ + ipc_message_handle_ = handle; + ipc_message_shm_ptr_ = ipc_message_shm; +} + }}}; // namespace triton::backend::python diff --git a/src/ipc_message.h b/src/ipc_message.h index 8e762b8f..c3d1472e 100644 --- a/src/ipc_message.h +++ b/src/ipc_message.h @@ -97,6 +97,10 @@ class IPCMessage { static std::unique_ptr Create( const std::unique_ptr& shm_pool, bool inline_response); + + static std::unique_ptr Create( + IPCMessageShm* ipc_message_shm, + bi::managed_external_buffer::handle_t& message_handle); static std::unique_ptr LoadFromSharedMemory( std::unique_ptr& shm_pool, bi::managed_external_buffer::handle_t message_handle); @@ -108,6 +112,7 @@ class IPCMessage { bi::interprocess_mutex* ResponseMutex(); bi::managed_external_buffer::handle_t& Args(); bi::managed_external_buffer::handle_t ShmHandle(); + AllocatedSharedMemory& GetAllocatedSharedMemory(); private: AllocatedSharedMemory ipc_message_shm_; @@ -129,6 +134,10 @@ class IPCMessage { AllocatedSharedMemory& ipc_message_shm, AllocatedSharedMemory& response_mutex_shm, AllocatedSharedMemory& response_cond_shm); + + IPCMessage( + IPCMessageShm* ipc_message_shm, + bi::managed_external_buffer::handle_t& handle); }; }}}; // namespace triton::backend::python diff --git a/src/pb_stub.cc b/src/pb_stub.cc index 007e7f29..a26719d2 100644 --- a/src/pb_stub.cc +++ b/src/pb_stub.cc @@ -653,27 +653,20 @@ Stub::ProcessRequests(RequestBatch* request_batch_shm_ptr) { py::list py_request_list = LoadRequestsFromSharedMemory(request_batch_shm_ptr); - std::unique_ptr execute_response = - IPCMessage::Create(shm_pool_, false /* Inline response */); - execute_response->Command() = PYTHONSTUB_ExecuteResponse; + std::unique_ptr execute_response; - AllocatedSharedMemory response_batch = - shm_pool_->Construct(); - ResponseBatch* response_batch_shm_ptr = - reinterpret_cast(response_batch.data_.get()); - execute_response->Args() = response_batch.handle_; + std::optional> response_batch; bool has_exception = false; std::string error_string; std::unique_ptr error_string_shm; + std::string err_message; ScopedDefer execute_finalize([this] { stub_message_queue_->Pop(); }); ScopedDefer _( [this, &execute_response] { SendIPCMessage(execute_response); }); - + py::object execute_return; + py::object coroutine_return; try { - response_batch_shm_ptr->has_error = false; - response_batch_shm_ptr->is_error_set = false; - if (!py::hasattr(model_instance_, "execute")) { std::string message = "Python model " + model_context_.PythonModelPath() + " does not implement `execute` method."; @@ -683,8 +676,7 @@ Stub::ProcessRequests(RequestBatch* request_batch_shm_ptr) { NVTX_RANGE(nvtx_, "PyExecute " + name_); - py::object execute_return = - model_instance_.attr("execute")(py_request_list); + execute_return = model_instance_.attr("execute")(py_request_list); bool is_coroutine = py::module::import("asyncio") .attr("iscoroutine")(execute_return) @@ -694,12 +686,14 @@ Stub::ProcessRequests(RequestBatch* request_batch_shm_ptr) // Do not wait for async decoupled execute to return. 
RunCoroutine(execute_return, true /* in_background */); } else { - py::object coroutine_return = + coroutine_return = RunCoroutine(execute_return, false /* in_background */); - ProcessReturnedResponses(py_request_list, coroutine_return); + ProcessReturnedResponses( + py_request_list, coroutine_return, response_batch); } } else { - ProcessReturnedResponses(py_request_list, execute_return); + ProcessReturnedResponses( + py_request_list, execute_return, response_batch); } } } @@ -713,16 +707,36 @@ Stub::ProcessRequests(RequestBatch* request_batch_shm_ptr) } if (has_exception) { - std::string err_message = - std::string( - "Failed to process the request(s) for model '" + name_ + - "', message: ") + - error_string; + err_message = std::string( + "Failed to process the request(s) for model '" + name_ + + "', message: ") + + error_string; LOG_ERROR << err_message.c_str(); + if (!response_batch) { + response_batch = shm_pool_->Construct( + sizeof(ResponseBatch) + sizeof(IPCMessageShm)); + } + ResponseBatch* response_batch_shm_ptr = reinterpret_cast( + response_batch.value().data_.get() + sizeof(IPCMessageShm)); + + // The backend will clean up the response factory if there is an error in + // the response batch. For decoupled mode, it is necessary to handle cases + // where the response sender should have already cleaned up, ensuring the + // backend does not delete the response factory again during error handling. + if (IsDecoupled()) { + for (py::handle py_request : py_request_list) { + InferRequest* request = py_request.cast(); + if (request->GetResponseSender()->IsClosed()) { + response_batch_shm_ptr->is_response_factory_deleted = true; + } + } + } + response_batch_shm_ptr->has_error = true; error_string_shm = PbString::Create(shm_pool_, err_message); response_batch_shm_ptr->error = error_string_shm->ShmHandle(); response_batch_shm_ptr->is_error_set = true; + response_batch_shm_ptr->batch_size = 0; // Once the error is sent to the backend, the backend is supposed to close // all response factories if not already closed, so closing all response // senders if not already closed to prevent the model from sending more @@ -731,12 +745,47 @@ Stub::ProcessRequests(RequestBatch* request_batch_shm_ptr) InferRequest* request = py_request.cast(); request->GetResponseSender()->Close(); } + } else { + if (!response_batch) { + response_batch = shm_pool_->Construct( + sizeof(ResponseBatch) + sizeof(IPCMessageShm)); + ResponseBatch* response_batch_shm_ptr = reinterpret_cast( + response_batch.value().data_.get() + sizeof(IPCMessageShm)); + response_batch_shm_ptr->batch_size = 0; + } + ResponseBatch* response_batch_shm_ptr = reinterpret_cast( + response_batch.value().data_.get() + sizeof(IPCMessageShm)); + response_batch_shm_ptr->has_error = false; + response_batch_shm_ptr->is_error_set = false; + } + + execute_response = IPCMessage::Create( + reinterpret_cast(response_batch.value().data_.get()), + response_batch.value().handle_); + execute_response->Args() = + response_batch.value().handle_ + sizeof(IPCMessageShm); + execute_response->InlineResponse() = false; + execute_response->Command() = PYTHONSTUB_ExecuteResponse; + _.Complete(); + execute_finalize.Complete(); +} + +void +Stub::ProcessResponse(InferResponse* response) +{ + response->SaveToSharedMemory(shm_pool_, false /* copy_gpu */); + + for (auto& output_tensor : response->OutputTensors()) { + if (!output_tensor->IsCPU()) { + gpu_tensors_.push_back(output_tensor); + } } } void Stub::ProcessReturnedResponses( - py::list py_requests, py::object 
py_responses_obj) + py::list py_requests, py::object py_responses_obj, + std::optional>& response_batch) { // Return if there is nothing to process. if (py::isinstance(py_responses_obj)) { @@ -784,12 +833,55 @@ Stub::ProcessReturnedResponses( "return list, found type '" + std::string(py::str(py_responses[i].get_type())) + "'."); } - std::shared_ptr response = - py_responses[i].cast>(); - request->GetResponseSender()->Send( - response, TRITONSERVER_RESPONSE_COMPLETE_FINAL); + + InferResponse* response = py_responses[i].cast(); + try { + request->GetResponseSender()->UpdateStateAndCounters( + response, TRITONSERVER_RESPONSE_COMPLETE_FINAL); + } + catch (const PythonBackendException& pb_exception) { + // Handle the exception here to catch the error when there's a response + // returned from `execute()`. + if (request->GetResponseSender()->IsClosed()) { + response_batch = std::move(shm_pool_->Construct( + sizeof(ResponseBatch) + sizeof(IPCMessageShm))); + ResponseBatch* response_batch_shm_ptr = + reinterpret_cast( + response_batch.value().data_.get() + sizeof(IPCMessageShm)); + response_batch_shm_ptr->batch_size = 0; + response_batch_shm_ptr->is_response_factory_deleted = true; + } + throw pb_exception; + } + } + } + // Return all the created responses using response_batch. The reason + // that both of the paths are available is that sending the responses + // using response_batch is faster than using `response_sender`. + response_batch = std::move(shm_pool_->Construct( + sizeof(IPCMessageShm) + + requests_size * sizeof(bi::managed_external_buffer::handle_t) + + sizeof(ResponseBatch))); + ResponseBatch* response_batch_shm_ptr = reinterpret_cast( + response_batch.value().data_.get() + sizeof(IPCMessageShm)); + + bi::managed_external_buffer::handle_t* responses_shm_handle = + reinterpret_cast( + response_batch.value().data_.get() + sizeof(ResponseBatch) + + sizeof(IPCMessageShm)); + for (size_t i = 0; i < responses_size; i++) { + // Check the return type of execute function. + InferRequest* infer_request = py_requests[i].cast(); + InferResponse* infer_response = py_responses[i].cast(); + if (!py::isinstance(py_responses[i])) { + infer_response->PruneOutputTensors(infer_request->RequestedOutputNames()); + ProcessResponse(infer_response); + responses_shm_handle[i] = infer_response->ShmHandle(); + } else { + responses_shm_handle[i] = 0; } } + response_batch_shm_ptr->batch_size = requests_size; } py::object diff --git a/src/pb_stub.h b/src/pb_stub.h index 9ed74d9a..7d76ec9a 100644 --- a/src/pb_stub.h +++ b/src/pb_stub.h @@ -254,7 +254,10 @@ class Stub { void ProcessRequests(RequestBatch* request_batch_shm_ptr); void ProcessReturnedResponses( - py::list py_requests, py::object py_responses_obj); + py::list py_requests, py::object py_responses_obj, + std::optional>& response_batch); + + void ProcessResponse(InferResponse* response); py::object GetAsyncEventLoop(); diff --git a/src/pb_utils.h b/src/pb_utils.h index e68cfb0f..aacf6b49 100644 --- a/src/pb_utils.h +++ b/src/pb_utils.h @@ -167,6 +167,9 @@ struct ResponseBatch : SendMessageBase { bool is_error_set; uint32_t response_size; + + // Indicates whether the response factory has been deleted or not. 
+ bool is_response_factory_deleted = false; }; enum LogLevel { kInfo = 0, kWarning, kError, kVerbose }; diff --git a/src/python_be.cc b/src/python_be.cc index 761abdbf..bdf7b95f 100644 --- a/src/python_be.cc +++ b/src/python_be.cc @@ -153,6 +153,23 @@ ModelInstanceState::SetErrorForResponseSendMessage( } } +bool +ModelInstanceState::IsStubProcessAlive() +{ + boost::posix_time::ptime timeout = + boost::get_system_time() + boost::posix_time::seconds(1); + bi::scoped_lock lock(*Stub()->HealthMutex(), timeout); + + // Check if lock has been acquired. + if (lock) { + return Stub()->IpcControl()->stub_health; + } else { + // If It failed to obtain the lock, it means that the stub has been + // stuck or exited while holding the health mutex lock. + return false; + } +} + TRITONSERVER_Error* ModelInstanceState::SaveRequestsToSharedMemory( TRITONBACKEND_Request** requests, const uint32_t request_count, @@ -290,7 +307,7 @@ ModelInstanceState::SaveRequestsToSharedMemory( request, &request_timeout)); std::unique_ptr infer_request; - TRITONBACKEND_ResponseFactory* factory_ptr; + TRITONBACKEND_ResponseFactory* factory_ptr = nullptr; RETURN_IF_ERROR(TRITONBACKEND_ResponseFactoryNew(&factory_ptr, request)); infer_request = std::make_unique( @@ -322,8 +339,6 @@ ModelInstanceState::LaunchStubProcess() thread_pool_ = std::make_unique( model_state->StateForBackend()->thread_pool_size); - queue_monitor_thread_ = true; - queue_monitor_ = std::thread(&ModelInstanceState::MessageQueueMonitor, this); request_executor_ = std::make_unique( Stub()->ShmPool(), model_state->TritonServer()); @@ -685,44 +700,6 @@ ModelInstanceState::ExecuteBLSRequest( } } -void -ModelInstanceState::MessageQueueMonitor() -{ - while (queue_monitor_thread_) { - bi::managed_external_buffer::handle_t handle = - Stub()->ParentMessageQueue()->Pop(); - if (handle == DUMMY_MESSAGE) { - break; - } - std::unique_ptr message = - IPCMessage::LoadFromSharedMemory(Stub()->ShmPool(), handle); - - // Need to notify the model instance thread that the execute response has - // been received. 
- if (message->Command() == PYTHONSTUB_ExecuteResponse) { - std::lock_guard guard{mu_}; - received_message_ = std::move(message); - cv_.notify_one(); - } else if (message->Command() == PYTHONSTUB_ResponseSend) { - std::shared_ptr response_send_message = std::move(message); - std::packaged_task task([this, response_send_message] { - ResponseSendDecoupled(response_send_message); - }); - boost::asio::post(*thread_pool_, std::move(task)); - } else if ( - message->Command() == PYTHONSTUB_InferExecRequest || - message->Command() == PYTHONSTUB_InferStreamExecRequest) { - std::shared_ptr bls_execute = std::move(message); - std::packaged_task task([this, bls_execute] { - ExecuteBLSRequest( - bls_execute, - (bls_execute->Command() == PYTHONSTUB_InferStreamExecRequest)); - }); - boost::asio::post(*thread_pool_, std::move(task)); - } - } -} - void ModelInstanceState::StubToParentMQMonitor() { @@ -769,6 +746,25 @@ ModelInstanceState::StubToParentMQMonitor() ProcessModelControlRequest(message); break; } + case PYTHONSTUB_ResponseSend: { + std::shared_ptr response_send_message = std::move(message); + std::packaged_task task([this, response_send_message] { + ResponseSendDecoupled(response_send_message); + }); + boost::asio::post(*thread_pool_, std::move(task)); + break; + } + case PYTHONSTUB_InferExecRequest: + case PYTHONSTUB_InferStreamExecRequest: { + std::shared_ptr bls_execute = std::move(message); + std::packaged_task task([this, bls_execute] { + ExecuteBLSRequest( + bls_execute, + (bls_execute->Command() == PYTHONSTUB_InferStreamExecRequest)); + }); + boost::asio::post(*thread_pool_, std::move(task)); + break; + } default: { LOG_MESSAGE( TRITONSERVER_LOG_ERROR, "Unexpected message type received."); @@ -1030,6 +1026,100 @@ ModelInstanceState::ProcessModelControlRequest( }); } +TRITONSERVER_Error* +ModelInstanceState::SendMessageToStub( + bi::managed_external_buffer::handle_t message) +{ + bool success = false; + while (!success) { + uint64_t timeout_miliseconds = 1000; + { + boost::posix_time::ptime timeout = + boost::get_system_time() + + boost::posix_time::milliseconds(timeout_miliseconds); + + bi::scoped_lock lock( + *(Stub()->HealthMutex()), timeout); + + // Check if lock has been acquired. + if (lock) { + Stub()->IpcControl()->stub_health = false; + } else { + // If it failed to obtain the lock, it means that the stub has been + // stuck or exited while holding the health mutex lock. 
+ return TRITONSERVER_ErrorNew( + TRITONSERVER_ERROR_INTERNAL, "Failed to obtain the health mutex."); + } + } + + Stub()->StubMessageQueue()->Push( + message, timeout_miliseconds /* duration ms */, success); + + if (!success && !IsStubProcessAlive()) { + return TRITONSERVER_ErrorNew( + TRITONSERVER_ERROR_INTERNAL, "Stub process is not healthy."); + } + } + + return nullptr; // success +} + +void +ModelInstanceState::SendMessageAndReceiveResponse( + bi::managed_external_buffer::handle_t message, + bi::managed_external_buffer::handle_t& response, + std::shared_ptr>& responses, + TRITONBACKEND_Request** requests, const uint32_t request_count) +{ + auto error = SendMessageToStub(message); + if (error != nullptr) { + RespondErrorToAllRequests( + TRITONSERVER_ErrorMessage(error), responses, requests, request_count); + + return; + } + + bi::managed_external_buffer::handle_t response_message; + error = Stub()->ReceiveMessageFromStub(response_message); + if (error != nullptr) { + RespondErrorToAllRequests( + TRITONSERVER_ErrorMessage(error), responses, requests, request_count); + + return; + } + + response = response_message; +} + +void +ModelInstanceState::RespondErrorToAllRequests( + const char* message, + std::shared_ptr>& responses, + TRITONBACKEND_Request** requests, const uint32_t request_count) +{ + for (uint32_t r = 0; r < request_count; ++r) { + if ((*responses)[r] == nullptr) + continue; + + std::string err_message = + std::string( + "Failed to process the request(s) for model instance '" + Name() + + "', message: ") + + message; + + TRITONSERVER_Error* err = + TRITONSERVER_ErrorNew(TRITONSERVER_ERROR_INTERNAL, err_message.c_str()); + LOG_IF_ERROR( + TRITONBACKEND_ResponseSend( + (*responses)[r], TRITONSERVER_RESPONSE_COMPLETE_FINAL, err), + "failed sending response"); + + (*responses)[r] = nullptr; + TRITONSERVER_ErrorDelete(err); + } +} + + void ModelInstanceState::StartMonitor() { @@ -1060,6 +1150,17 @@ ModelInstanceState::ResponseSendDecoupled( ResponseSendMessage* send_message_payload = reinterpret_cast(send_message.data_.get()); std::unique_ptr error_message; + ScopedDefer response_factory_deleter([send_message_payload] { + if (send_message_payload->flags == TRITONSERVER_RESPONSE_COMPLETE_FINAL) { + TRITONBACKEND_ResponseFactory* response_factory = + reinterpret_cast( + send_message_payload->response_factory_address); + std::unique_ptr< + TRITONBACKEND_ResponseFactory, backend::ResponseFactoryDeleter> + lresponse_factory(reinterpret_cast( + response_factory)); + } + }); ScopedDefer _([send_message_payload] { { bi::scoped_lock guard{send_message_payload->mu}; @@ -1228,31 +1329,48 @@ ModelInstanceState::ProcessRequests( IPCMessage::Create(Stub()->ShmPool(), false /*inline_response*/)); ipc_message->Command() = PYTHONSTUB_CommandType::PYTHONSTUB_ExecuteRequest; ipc_message->Args() = request_batch.handle_; - received_message_ = nullptr; - ScopedDefer _([this] { + + ScopedDefer execute_finalize([this] { // Push a dummy message to signal the thread to terminate. 
Stub()->StubMessageQueue()->Push(DUMMY_MESSAGE); }); + std::unique_ptr response; { - std::unique_lock guard{mu_}; Stub()->StubMessageQueue()->Push(ipc_message->ShmHandle()); - cv_.wait(guard, [this] { return received_message_ != nullptr; }); + bi::managed_external_buffer::handle_t response_message; + RETURN_IF_ERROR(Stub()->ReceiveMessageFromStub(response_message)); + response = + IPCMessage::LoadFromSharedMemory(Stub()->ShmPool(), response_message); } - - AllocatedSharedMemory response_batch = - Stub()->ShmPool()->Load(received_message_->Args()); - received_message_.reset(); + char* ipc_message_shm = + reinterpret_cast(response->GetAllocatedSharedMemory().data_.get()); + ResponseBatch* response_batch_shm_ptr = + reinterpret_cast(ipc_message_shm + sizeof(IPCMessageShm)); uint64_t compute_end_ns = 0; SET_TIMESTAMP(compute_end_ns); reporter.SetComputeEndNs(compute_end_ns); reporter.SetBatchStatistics(total_batch_size); - if (response_batch.data_->has_error) { - if (response_batch.data_->is_error_set) { + if (response_batch_shm_ptr->has_error) { + // Clean up the response factory if an error occurred. The + // `is_response_factory_deleted` flag indicates whether the response factory + // has been deleted for some corner cases. + if (!response_batch_shm_ptr->is_response_factory_deleted) { + for (uint32_t r = 0; r < request_count; r++) { + TRITONBACKEND_ResponseFactory* response_factory = + reinterpret_cast( + pb_infer_requests[r]->GetResponseFactoryAddress()); + std::unique_ptr< + TRITONBACKEND_ResponseFactory, backend::ResponseFactoryDeleter> + lresponse_factory(reinterpret_cast( + response_factory)); + } + } + if (response_batch_shm_ptr->is_error_set) { auto error = PbString::LoadFromSharedMemory( - Stub()->ShmPool(), response_batch.data_->error); + Stub()->ShmPool(), response_batch_shm_ptr->error); return TRITONSERVER_ErrorNew( TRITONSERVER_ERROR_INTERNAL, error->String().c_str()); } @@ -1261,6 +1379,218 @@ ModelInstanceState::ProcessRequests( TRITONSERVER_ERROR_INTERNAL, "Failed to process the requests."); } + if (response_batch_shm_ptr->batch_size > 0) { + bi::managed_external_buffer::handle_t* response_shm_handle = + reinterpret_cast( + ipc_message_shm + sizeof(ResponseBatch) + sizeof(IPCMessageShm)); + + std::shared_ptr> responses( + new std::vector()); + responses->reserve(request_count); + for (size_t i = 0; i < request_count; i++) { + // It is possible to have multiple responses batched together in a single + // response batch shm, where some of the responses are None due to the + // usage of response sender, so only create a TRITONBACKEND_Response + // object for the valid responses. 
+ if (response_shm_handle[i] == 0) { + responses->emplace_back(nullptr); + } else { + TRITONBACKEND_Response* response; + auto err = TRITONBACKEND_ResponseNew(&response, requests[i]); + if (err == nullptr) { + responses->emplace_back(response); + } else { + responses->emplace_back(nullptr); + LOG_MESSAGE(TRITONSERVER_LOG_ERROR, "Fail to create response"); + TRITONSERVER_ErrorDelete(err); + } + } + } + + std::vector requires_deferred_callback; + + bool has_gpu_output = false; + std::vector> shm_responses; + std::vector, void*>>> + gpu_output_buffers(request_count); + GPUBuffersHelper gpu_buffer_helper; + + for (uint32_t r = 0; r < request_count; ++r) { + NVTX_RANGE(nvtx_, "LoadingResponse " + Name()); + requires_deferred_callback.push_back(false); + if (response_shm_handle[r] == 0) { + continue; + } + TRITONBACKEND_Response* response = (*responses)[r]; + TRITONBACKEND_Request* request = requests[r]; + uint32_t requested_output_count = 0; + + shm_responses.emplace_back(nullptr); + std::unique_ptr& infer_response = shm_responses.back(); + try { + if (pb_infer_requests[r]->ReleaseFlags() == + TRITONSERVER_REQUEST_RELEASE_RESCHEDULE) { + // For rescheduled requests, we do not need to send a response. + LOG_IF_ERROR( + TRITONBACKEND_ResponseDelete((*responses)[r]), + "failed to delete response"); + (*responses)[r] = nullptr; + continue; + } + { + TRITONBACKEND_ResponseFactory* response_factory = + reinterpret_cast( + pb_infer_requests[r]->GetResponseFactoryAddress()); + std::unique_ptr< + TRITONBACKEND_ResponseFactory, backend::ResponseFactoryDeleter> + lresponse_factory( + reinterpret_cast( + response_factory)); + } + infer_response = InferResponse::LoadFromSharedMemory( + Stub()->ShmPool(), response_shm_handle[r], + false /* open_cuda_handle */); + if (infer_response->HasError()) { + TRITONSERVER_Error* err = TRITONSERVER_ErrorNew( + infer_response->Error()->Code(), + infer_response->Error()->Message().c_str()); + + LOG_IF_ERROR( + TRITONBACKEND_ResponseSend( + (*responses)[r], TRITONSERVER_RESPONSE_COMPLETE_FINAL, err), + "failed sending response"); + TRITONSERVER_ErrorDelete(err); + (*responses)[r] = nullptr; + + // Reset the release flags for the request. + pb_infer_requests[r]->SetReleaseFlags( + TRITONSERVER_REQUEST_RELEASE_ALL); + + // If has_error is true, we do not look at the response tensors. + continue; + } + } + catch (const PythonBackendException& pb_exception) { + TRITONSERVER_Error* err = TRITONSERVER_ErrorNew( + TRITONSERVER_ERROR_INTERNAL, pb_exception.what()); + LOG_IF_ERROR( + TRITONBACKEND_ResponseSend( + (*responses)[r], TRITONSERVER_RESPONSE_COMPLETE_FINAL, err), + "failed sending response"); + TRITONSERVER_ErrorDelete(err); + (*responses)[r] = nullptr; + + // Reset the release flags for the request. + pb_infer_requests[r]->SetReleaseFlags(TRITONSERVER_REQUEST_RELEASE_ALL); + + continue; + } + + GUARDED_RESPOND_IF_ERROR( + responses, r, + TRITONBACKEND_RequestOutputCount(request, &requested_output_count)); + std::set requested_output_names; + for (size_t j = 0; j < requested_output_count; ++j) { + const char* output_name; + GUARDED_RESPOND_IF_ERROR( + responses, r, + TRITONBACKEND_RequestOutputName(request, j, &output_name)); + requested_output_names.insert(output_name); + } + + bool require_deferred_callback = false; + +#ifdef TRITON_ENABLE_GPU + for (auto& output_tensor : infer_response->OutputTensors()) { + if (output_tensor->MemoryType() == TRITONSERVER_MEMORY_GPU) { + // Attempt to use the cuda shared memory pool for GPU tensor. 
+ ShareCUDAMemoryPool(output_tensor->MemoryTypeId()); + } + } +#endif // TRITON_ENABLE_GPU + + gpu_output_buffers[r] = + std::vector, void*>>{}; + infer_response->Send( + response, CudaStream(), require_deferred_callback, + TRITONSERVER_RESPONSE_COMPLETE_FINAL, Stub()->ShmPool(), + gpu_buffer_helper, gpu_output_buffers[r], requested_output_names); + + requires_deferred_callback[r] = require_deferred_callback; + + if (requires_deferred_callback[r]) { + has_gpu_output = true; + } + } + + execute_finalize.Complete(); + + // If the output tensor is in GPU, there will be a second round trip + // required for filling the GPU buffers provided by the main process. + if (has_gpu_output) { + ipc_message->Command() = + PYTHONSTUB_CommandType::PYTHONSTUB_LoadGPUBuffers; + gpu_buffer_helper.Complete(Stub()->ShmPool()); + ipc_message->Args() = gpu_buffer_helper.ShmHandle(); + bi::managed_external_buffer::handle_t response_message; + SendMessageAndReceiveResponse( + ipc_message->ShmHandle(), response_message, responses, requests, 0); + + bool cuda_copy = false; + + uint32_t response_index = 0; + for (auto& gpu_output_buffer : gpu_output_buffers) { + for (auto& buffer_memory_pair : gpu_output_buffer) { + auto& pb_memory = buffer_memory_pair.first; + void* pointer = buffer_memory_pair.second; + bool cuda_used = false; + + if (pb_memory->MemoryType() == TRITONSERVER_MEMORY_CPU) { + GUARDED_RESPOND_IF_ERROR( + responses, response_index, + CopyBuffer( + "Failed to copy the output tensor to buffer.", + TRITONSERVER_MEMORY_CPU, 0, TRITONSERVER_MEMORY_CPU, 0, + pb_memory->ByteSize(), pb_memory->DataPtr(), pointer, + CudaStream(), &cuda_used)); + cuda_copy |= cuda_used; + } else if ( + (pb_memory->MemoryType() == TRITONSERVER_MEMORY_GPU) && + pb_memory->UseCUDASharedPool() && + (pb_memory->DataPtr() != pointer)) { + // If the data pointer from pb_memory is not the same as the + // pointer, it means that the Triton-provided buffer is not used + // during tensor transfer. Instead, an intermediate buffer that uses + // CUDA shared memory pool is used. In this case, we need to copy + // the data from the intermediate buffer back to the Triton-provided + // buffer. + GUARDED_RESPOND_IF_ERROR( + responses, response_index, + CopyBuffer( + "Failed to copy the output tensor to buffer.", + TRITONSERVER_MEMORY_GPU, pb_memory->MemoryTypeId(), + TRITONSERVER_MEMORY_GPU, pb_memory->MemoryTypeId(), + pb_memory->ByteSize(), pb_memory->DataPtr(), pointer, + CudaStream(), &cuda_used)); + cuda_copy |= cuda_used; + } + } + response_index++; +#ifdef TRITON_ENABLE_GPU + if (cuda_copy) { + cudaStreamSynchronize(stream_); + } +#endif // TRITON_ENABLE_GPU + } + } + + for (uint32_t r = 0; r < request_count; ++r) { + if (requires_deferred_callback[r]) { + shm_responses[r]->DeferredSendCallback(); + } + } + } + return nullptr; // success } @@ -1401,16 +1731,12 @@ ModelInstanceState::~ModelInstanceState() if (Stub()->IsHealthy()) { // Wait for all the pending tasks to finish. thread_pool_->wait(); - // Push a dummy message to signal the thread to terminate. 
- Stub()->ParentMessageQueue()->Push(DUMMY_MESSAGE); - queue_monitor_.join(); } // Terminate stub first to allow any last messages to be received by the back // end before deallocating the queue memory Stub()->TerminateStub(); TerminateMonitor(); Stub()->ClearQueues(); - received_message_.reset(); Stub().reset(); } diff --git a/src/python_be.h b/src/python_be.h index 59660fc4..c98e1284 100644 --- a/src/python_be.h +++ b/src/python_be.h @@ -287,9 +287,6 @@ class ModelInstanceState : public BackendModelInstance { std::thread stub_to_parent_queue_monitor_; bool stub_to_parent_thread_; - // Queue monitor thread - std::thread queue_monitor_; - bool queue_monitor_thread_; std::mutex mu_; std::condition_variable cv_; std::unique_ptr received_message_; @@ -361,6 +358,24 @@ class ModelInstanceState : public BackendModelInstance { AllocatedSharedMemory& request_batch, std::shared_ptr>& responses); + void SendMessageAndReceiveResponse( + bi::managed_external_buffer::handle_t message, + bi::managed_external_buffer::handle_t& response, + std::shared_ptr>& responses, + TRITONBACKEND_Request** requests, const uint32_t request_count); + + void RespondErrorToAllRequests( + const char* message, + std::shared_ptr>& responses, + TRITONBACKEND_Request** requests, const uint32_t request_count); + + // void SendMessageToStub(bi::managed_external_buffer::handle_t message); + TRITONSERVER_Error* SendMessageToStub( + bi::managed_external_buffer::handle_t message); + + // Checks whether the stub process is live + bool IsStubProcessAlive(); + // Model instance stub std::unique_ptr& Stub() { return model_instance_stub_; } diff --git a/src/response_sender.cc b/src/response_sender.cc index 0a88fb6b..ef3b09dd 100644 --- a/src/response_sender.cc +++ b/src/response_sender.cc @@ -74,7 +74,7 @@ ResponseSender::~ResponseSender() void ResponseSender::UpdateStateAndCounters( - const std::shared_ptr& response, const uint32_t flags) + InferResponse* response, const uint32_t flags) { if (is_decoupled_ == nullptr) { // TODO: Can a model access the response sender on a BLS infer request? @@ -106,6 +106,7 @@ ResponseSender::UpdateStateAndCounters( } if (flags == TRITONSERVER_RESPONSE_COMPLETE_FINAL) { + response_factory_deleted_.exchange(true); closed_ = true; } number_of_response_sent_++; @@ -123,7 +124,7 @@ ResponseSender::Send( py::gil_scoped_release release; CheckResponseSenderArguments(infer_response, flags); - UpdateStateAndCounters(infer_response, flags); + UpdateStateAndCounters(infer_response.get(), flags); if (infer_response) { infer_response->PruneOutputTensors(requested_output_names_); } @@ -172,7 +173,11 @@ ResponseSender::Send( { bi::scoped_lock guard{send_message_payload->mu}; - stub->SendIPCMessage(ipc_message); + // The server will destruct the response factory if the final flag is set. 
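+    // Setting response_factory_deleted_ below records that ownership of the
+    // factory has passed back to the server, so the sender's own cleanup
+    // (DeleteResponseFactory()) can skip it and a double delete is avoided.
+    // (Comment added for clarity; the rationale is inferred from the removal
+    // of the explicit DeleteResponseFactory() call at the end of Send().)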
+ if (flags == TRITONSERVER_RESPONSE_COMPLETE_FINAL) { + response_factory_deleted_.exchange(true); + } + stub->SendIPCUtilsMessage(ipc_message); while (!send_message_payload->is_stub_turn) { send_message_payload->cv.wait(guard); } @@ -246,10 +251,6 @@ ResponseSender::Send( "An error occurred while sending a response."); } } - - if (flags == TRITONSERVER_RESPONSE_COMPLETE_FINAL) { - DeleteResponseFactory(); - } } bool @@ -258,11 +259,19 @@ ResponseSender::IsCancelled() return pb_cancel_->IsCancelled(); } +bool +ResponseSender::IsClosed() +{ + std::lock_guard lk(mu_); + return closed_; +} + void ResponseSender::Close() { std::lock_guard lk(mu_); closed_ = true; + response_factory_deleted_.exchange(true); } void diff --git a/src/response_sender.h b/src/response_sender.h index 69f416c2..a696f9eb 100644 --- a/src/response_sender.h +++ b/src/response_sender.h @@ -43,16 +43,17 @@ class ResponseSender { const std::set& requested_output_names, std::unique_ptr& shm_pool, const std::shared_ptr& pb_cancel); + intptr_t ResponseFactory() { return response_factory_address_; } ~ResponseSender(); void Send(std::shared_ptr response, const uint32_t flags); bool IsCancelled(); + void UpdateStateAndCounters(InferResponse* response, const uint32_t flags); // Can be useful at stopping the model from sending any more responses. void Close(); + bool IsClosed(); private: - void UpdateStateAndCounters( - const std::shared_ptr& response, const uint32_t flags); void DeleteResponseFactory(); intptr_t request_address_; From 682db01bbec9d4b4ed18da80c37a55d9331b83ac Mon Sep 17 00:00:00 2001 From: Francesco Petrini Date: Thu, 31 Oct 2024 10:30:25 -0700 Subject: [PATCH 37/54] build: RHEL8 Python Backend (#385) * PYBE RHEL --- CMakeLists.txt | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) diff --git a/CMakeLists.txt b/CMakeLists.txt index ee209b5b..d27f10a5 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -92,6 +92,18 @@ FetchContent_Declare( GIT_TAG "aa304c9c7d725ffb9d10af08a3b34cb372307020" GIT_SHALLOW ON ) + +# RHEL base container has multiple version of Python installed. By default +# it seems like pybind will pickup v3.6, so we specifically assign it to +# search for 3.12 here. +set(RHEL_BUILD OFF) +if(LINUX) + file(STRINGS "/etc/os-release" DISTRO_ID_LIKE REGEX "ID_LIKE") + if(${DISTRO_ID_LIKE} MATCHES "rhel|centos") + set(RHEL_BUILD ON) + set(PYBIND11_PYTHON_VERSION 3.12) + endif(${DISTRO_ID_LIKE} MATCHES "rhel|centos") +endif(LINUX) FetchContent_MakeAvailable(pybind11) # @@ -268,6 +280,23 @@ target_compile_options( ) target_compile_definitions(triton-python-backend-stub PRIVATE TRITON_PB_STUB) +# RHEL assets are not released in a container environment nor do the current +# Python lib versions in the manylinux base container match those currently +# available for RHEL8 package managers. Therefore, we package the correct +# python libs in the backend folder and adjust the stub executable to look +# in its own folder at runtime. 
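+# ("$ORIGIN" in the INSTALL_RPATH below expands at load time to the directory
+# containing the stub executable itself, so the Python libraries packaged
+# next to the stub are resolved from that same folder.)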
+if(RHEL_BUILD) + set_target_properties( + triton-python-backend-stub + PROPERTIES + SKIP_BUILD_RPATH TRUE + BUILD_WITH_INSTALL_RPATH TRUE + INSTALL_RPATH_USE_LINK_PATH FALSE + INSTALL_RPATH "$\{ORIGIN\}" + ) +endif(RHEL_BUILD) + + # For WIN32 do not link Threads and DL_LIBS if(WIN32) target_link_libraries( From 09c35373d66141ad052bccf4b1591d1f1ad8034c Mon Sep 17 00:00:00 2001 From: KrishnanPrash <140860868+KrishnanPrash@users.noreply.github.com> Date: Fri, 1 Nov 2024 15:47:19 -0500 Subject: [PATCH 38/54] updating pybind11 version (#384) --- CMakeLists.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index d27f10a5..cc94b3aa 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -88,8 +88,8 @@ FetchContent_MakeAvailable(repo-common repo-core repo-backend) FetchContent_Declare( pybind11 GIT_REPOSITORY "https://github.com/pybind/pybind11" - # COMMIT ID for v2.10.0 - GIT_TAG "aa304c9c7d725ffb9d10af08a3b34cb372307020" + # COMMIT ID for v2.12.0 + GIT_TAG "3e9dfa2866941655c56877882565e7577de6fc7b" GIT_SHALLOW ON ) From f3068c03ed82e099cef5e2b40e9d1d79b1eab7ac Mon Sep 17 00:00:00 2001 From: Misha Chornyi <99709299+mc-nv@users.noreply.github.com> Date: Tue, 26 Nov 2024 15:32:33 -0800 Subject: [PATCH 39/54] Build Updates for Ubuntu24.04 (#386) (#387) (#388) * skip warning errors * Revert "skip warning errors" This reverts commit 071c052dd876820776c27a792271ba4100a4ce8a. * Reapply "skip warning errors" This reverts commit a088c296484ee4a4dae60cde70111b4225524258. Co-authored-by: Anant Sharma --- CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index cc94b3aa..0dc70f0d 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -267,7 +267,7 @@ target_compile_features(triton-python-backend PRIVATE cxx_std_${TRITON_MIN_CXX_S target_compile_options( triton-python-backend PRIVATE $<$,$,$>: - -Wall -Wextra -Wno-unused-parameter -Wno-type-limits -Werror> + -Wall -Wextra -Wno-unused-parameter -Wno-type-limits> $<$:/Wall /D_WIN32_WINNT=0x0A00 /EHsc /Zc:preprocessor> ) From 8e123478ecdf33ae781e6419cb2b84942a7365ff Mon Sep 17 00:00:00 2001 From: Francesco Petrini Date: Tue, 10 Dec 2024 09:57:09 -0800 Subject: [PATCH 40/54] Remove Strict Requirement (#389) --- CMakeLists.txt | 1 - 1 file changed, 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 0dc70f0d..69c7c698 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -101,7 +101,6 @@ if(LINUX) file(STRINGS "/etc/os-release" DISTRO_ID_LIKE REGEX "ID_LIKE") if(${DISTRO_ID_LIKE} MATCHES "rhel|centos") set(RHEL_BUILD ON) - set(PYBIND11_PYTHON_VERSION 3.12) endif(${DISTRO_ID_LIKE} MATCHES "rhel|centos") endif(LINUX) FetchContent_MakeAvailable(pybind11) From b771f4f2f3fae3eb97c8f3624d268fd3947f96ea Mon Sep 17 00:00:00 2001 From: Jacky <18255193+kthui@users.noreply.github.com> Date: Wed, 11 Dec 2024 14:05:59 -0800 Subject: [PATCH 41/54] fix: Fix requested output deleting extra outputs (#390) * fix: Hold GIL when deleting numpy array * chore: setting py obj to None may not destruct the object --- src/pb_stub.cc | 6 +++--- src/pb_tensor.cc | 8 ++++++++ 2 files changed, 11 insertions(+), 3 deletions(-) diff --git a/src/pb_stub.cc b/src/pb_stub.cc index a26719d2..51df5aa2 100644 --- a/src/pb_stub.cc +++ b/src/pb_stub.cc @@ -1032,9 +1032,9 @@ Stub::~Stub() { py::gil_scoped_acquire acquire; - async_event_loop_ = py::none(); - background_futures_ = py::none(); - model_instance_ = py::none(); + py::object 
async_event_loop_local(std::move(async_event_loop_)); + py::object background_futures_local(std::move(background_futures_)); + py::object model_instance_local(std::move(model_instance_)); } stub_instance_.reset(); stub_message_queue_.reset(); diff --git a/src/pb_tensor.cc b/src/pb_tensor.cc index 1ab95144..9fde62fe 100644 --- a/src/pb_tensor.cc +++ b/src/pb_tensor.cc @@ -503,6 +503,14 @@ PbTensor::~PbTensor() noexcept(false) { pb_memory_.reset(); DeleteDLPack(); + +#ifdef TRITON_PB_STUB + { + py::gil_scoped_acquire acquire; + py::array numpy_array_local(std::move(numpy_array_)); + py::array numpy_array_serialized_local(std::move(numpy_array_serialized_)); + } +#endif } const std::string& From 1ea48a6f7c3d4c27ceacc0ad1acdbe2002a0476c Mon Sep 17 00:00:00 2001 From: Jacky <18255193+kthui@users.noreply.github.com> Date: Fri, 24 Jan 2025 16:20:45 -0800 Subject: [PATCH 42/54] feat: Add parameters support to InferResponse (#394) * Add parameters support to InferResponse * Infer response to track parameters * Add parameters to binding infer response * Rank parameters argument up among InferResponse constructor arguments * Add setting parameters to Triton response * Send response parameters only on non-error * Fix double declaration * Unify py dictionary parameters to json str * Add documentation * Mark response parameters accessor const and JSON serializable * [Docs] Note BLS response parameters are not populated currently * [comment] Clarify why PbTensor::LoadFromSharedMemory() requires holding GIL --- README.md | 21 ++++++- src/infer_response.cc | 78 ++++++++++++++++++++---- src/infer_response.h | 13 ++-- src/pb_stub.cc | 131 ++++++++++++++++++++++------------------ src/request_executor.cc | 20 +++--- 5 files changed, 180 insertions(+), 83 deletions(-) diff --git a/README.md b/README.md index ffcef26c..a6242a44 100644 --- a/README.md +++ b/README.md @@ -1,5 +1,5 @@