Commit b7cec48

Fix BLS performance (triton-inference-server#135)
1 parent 41d6460 commit b7cec48

5 files changed: +70 -79 lines changed

src/pb_tensor.cc

Lines changed: 32 additions & 16 deletions
@@ -122,16 +122,14 @@ PbTensor::PbTensor(
     const std::string& name, const std::vector<int64_t>& dims,
     TRITONSERVER_DataType dtype, TRITONSERVER_MemoryType memory_type,
     int64_t memory_type_id, void* memory_ptr, uint64_t byte_size,
-    DLManagedTensor* dl_managed_tensor,
-    bi::managed_external_buffer::handle_t shm_handle)
+    DLManagedTensor* dl_managed_tensor)
 {
   name_ = name;
   memory_ptr_ = memory_ptr;
   memory_type_ = memory_type;
   memory_type_id_ = memory_type_id;
   dtype_ = dtype;
   dims_ = dims;
-  // [FIXME] fix shm_handle

 #ifdef TRITON_PB_STUB
   if (memory_type_ == TRITONSERVER_MEMORY_CPU ||
@@ -400,9 +398,16 @@ PbTensor::SaveToSharedMemory(
     std::unique_ptr<SharedMemoryManager>& shm_pool, bool copy_gpu)
 {
   if (!tensor_shm_.data_) {
-    uint64_t byte_size = sizeof(TensorShm) + sizeof(int64_t) * dims_.size() +
-                         PbString::ShmStructSize(name_) +
-                         PbMemory::ShmStructSize(memory_type_, byte_size_);
+    uint64_t byte_size;
+    if (!pb_memory_) {
+      byte_size = sizeof(TensorShm) + sizeof(int64_t) * dims_.size() +
+                  PbString::ShmStructSize(name_) +
+                  PbMemory::ShmStructSize(memory_type_, byte_size_);
+
+    } else {
+      byte_size = sizeof(TensorShm) + sizeof(int64_t) * dims_.size() +
+                  PbString::ShmStructSize(name_);
+    }
     tensor_shm_ = shm_pool->Construct<char>(byte_size);

     tensor_shm_ptr_ = reinterpret_cast<TensorShm*>(tensor_shm_.data_.get());
@@ -425,13 +430,17 @@ PbTensor::SaveToSharedMemory(
         shm_handle_ + name_offset);
     std::size_t pb_memory_offset = name_offset + PbString::ShmStructSize(name_);

-    pb_memory_ = PbMemory::Create(
-        memory_type_, memory_type_id_, byte_size_,
-        reinterpret_cast<char*>(memory_ptr_),
-        reinterpret_cast<char*>(tensor_shm_ptr_) + pb_memory_offset,
-        shm_handle_ + pb_memory_offset, copy_gpu);
+    if (!pb_memory_) {
+      pb_memory_ = PbMemory::Create(
+          memory_type_, memory_type_id_, byte_size_,
+          reinterpret_cast<char*>(memory_ptr_),
+          reinterpret_cast<char*>(tensor_shm_ptr_) + pb_memory_offset,
+          shm_handle_ + pb_memory_offset, copy_gpu);
+      tensor_shm_ptr_->memory = 0;
+    } else {
+      tensor_shm_ptr_->memory = pb_memory_->ShmHandle();
+    }

-    tensor_shm_ptr_->memory = pb_memory_->ShmHandle();
     memory_ptr_ = pb_memory_->DataPtr();
   }
 }
@@ -449,10 +458,17 @@ PbTensor::LoadFromSharedMemory(
   std::unique_ptr<PbString> name_shm = PbString::LoadFromSharedMemory(
       tensor_handle + name_offset, tensor_shm.data_.get() + name_offset);

-  std::size_t pb_memory_offset = name_offset + name_shm->Size();
-  std::unique_ptr<PbMemory> pb_memory = PbMemory::LoadFromSharedMemory(
-      pb_memory_offset, tensor_shm.data_.get() + pb_memory_offset,
-      open_cuda_handle);
+  std::unique_ptr<PbMemory> pb_memory;
+  if (tensor_shm_ptr->memory == 0) {
+    std::size_t pb_memory_offset = name_offset + name_shm->Size();
+    pb_memory = PbMemory::LoadFromSharedMemory(
+        pb_memory_offset, tensor_shm.data_.get() + pb_memory_offset,
+        open_cuda_handle);
+  } else {
+    pb_memory = PbMemory::LoadFromSharedMemory(
+        shm_pool, tensor_shm_ptr->memory, open_cuda_handle);
+  }
+
   return std::unique_ptr<PbTensor>(
       new PbTensor(tensor_shm, name_shm, pb_memory));
 }

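The hunks above let a PbTensor reuse a PbMemory block it already owns: SaveToSharedMemory records that block's existing handle instead of copying the payload into a fresh allocation, and LoadFromSharedMemory treats a handle of 0 as "the payload is inlined after the header". A minimal sketch of that convention follows; the struct and helper are illustrative stand-ins, not the backend's real types.

#include <cstdint>

// Illustrative stand-in for the tensor header stored in shared memory.
struct TensorHeader {
  std::uint64_t memory;  // 0 => payload is inlined after the header,
                         // otherwise the handle of an existing PbMemory block
};

// Mirrors the branch in LoadFromSharedMemory(): fall back to the inline
// offset only when no separate PbMemory handle was recorded.
std::uint64_t PayloadHandle(
    const TensorHeader& header, std::uint64_t inline_offset)
{
  return (header.memory == 0) ? inline_offset : header.memory;
}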
src/pb_tensor.h

Lines changed: 1 addition & 2 deletions
@@ -96,8 +96,7 @@ class PbTensor {
       const std::string& name, const std::vector<int64_t>& dims,
       TRITONSERVER_DataType dtype, TRITONSERVER_MemoryType memory_type,
       int64_t memory_type_id, void* memory_ptr, uint64_t byte_size,
-      DLManagedTensor* dl_managed_tensor = nullptr,
-      bi::managed_external_buffer::handle_t shm_handle = 0);
+      DLManagedTensor* dl_managed_tensor = nullptr);

   /// This constructor is used when
   /// loading the tensor from shared memory.

src/python.cc

Lines changed: 7 additions & 10 deletions
@@ -307,9 +307,7 @@ class ModelInstanceState : public BackendModelInstance {
       bool& restart);

   // Execute a BLS Request
-  void ExecuteBLSRequest(
-      std::unique_ptr<SharedMemoryManager>& shm_pool,
-      bi::managed_external_buffer::handle_t message_offset);
+  void ExecuteBLSRequest(bi::managed_external_buffer::handle_t message_offset);

   // Cleanup BLS responses
   void CleanupBLSResponses();
@@ -959,14 +957,13 @@ ModelInstanceState::GetInputTensor(

 void
 ModelInstanceState::ExecuteBLSRequest(
-    std::unique_ptr<SharedMemoryManager>& shm_pool,
     bi::managed_external_buffer::handle_t message_offset)
 {
   ModelState* model_state = reinterpret_cast<ModelState*>(Model());
   auto request_executor =
-      std::make_unique<RequestExecutor>(model_state->TritonServer());
+      std::make_unique<RequestExecutor>(shm_pool_, model_state->TritonServer());
   std::unique_ptr<IPCMessage> ipc_message =
-      IPCMessage::LoadFromSharedMemory(shm_pool, message_offset);
+      IPCMessage::LoadFromSharedMemory(shm_pool_, message_offset);
   bool is_response_batch_set = false;
   std::unique_ptr<InferResponse> infer_response;
   ResponseBatch* response_batch;
@@ -1078,8 +1075,8 @@ ModelInstanceState::ExecuteBLSRequest(
       }

       if (pb_exception.what() != nullptr) {
-        infer_response = request_executor->Infer(
-            infer_request, shm_pool_, &inference_response);
+        infer_response =
+            request_executor->Infer(infer_request, &inference_response);

         if (infer_response) {
           infer_response->SaveToSharedMemory(shm_pool_);
@@ -1110,7 +1107,7 @@ ModelInstanceState::ExecuteBLSRequest(
     if (is_response_batch_set) {
       response_batch->has_error = true;
       LOG_IF_EXCEPTION(
-          pb_error_message = PbString::Create(shm_pool, pb_exception.what()));
+          pb_error_message = PbString::Create(shm_pool_, pb_exception.what()));

       if (pb_error_message != nullptr) {
         response_batch->is_error_set = true;
@@ -1370,7 +1367,7 @@ ModelInstanceState::ProcessRequests(
       // Launch the BLS request in a future.
       bls_futures_.emplace_back(
           std::async(std::launch::async, [this, current_message]() {
-            this->ExecuteBLSRequest(this->shm_pool_, current_message);
+            this->ExecuteBLSRequest(current_message);
           }));

       auto error = ReceiveMessageFromStub(response_message);

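With shm_pool_ reachable as a member of both ModelInstanceState and RequestExecutor, the BLS path no longer threads the shared-memory pool through every call; only the message handle travels with the request. A small sketch of the resulting launch pattern, using placeholder types (Pool, Instance) rather than the backend's real classes:

#include <cstdint>
#include <future>
#include <memory>
#include <vector>

struct Pool {};  // placeholder for SharedMemoryManager

class Instance {
 public:
  // Queue the BLS request on a separate thread; the pool is reached through
  // the member rather than being passed as an argument.
  void Launch(std::uint64_t message_handle)
  {
    bls_futures_.emplace_back(
        std::async(std::launch::async, [this, message_handle]() {
          this->ExecuteBLSRequest(message_handle);
        }));
  }

 private:
  void ExecuteBLSRequest(std::uint64_t message_handle)
  {
    // Would load the IPC message from shm_pool_ and run the request.
    (void)message_handle;
  }

  std::unique_ptr<Pool> shm_pool_ = std::make_unique<Pool>();
  std::vector<std::future<void>> bls_futures_;
};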
src/request_executor.cc

Lines changed: 25 additions & 43 deletions
@@ -28,6 +28,7 @@

 #include <future>
 #include "pb_utils.h"
+#include "scoped_defer.h"
 #include "triton/backend/backend_common.h"
 #include "triton/core/tritonserver.h"

@@ -72,7 +73,10 @@ ResponseAlloc(
     void** buffer_userp, TRITONSERVER_MemoryType* actual_memory_type,
     int64_t* actual_memory_type_id)
 {
-  SharedMemoryManager* shm_pool = reinterpret_cast<SharedMemoryManager*>(userp);
+  std::unique_ptr<SharedMemoryManager> shm_pool(
+      reinterpret_cast<SharedMemoryManager*>(userp));
+
+  ScopedDefer _([&shm_pool] { shm_pool.release(); });
   *actual_memory_type = preferred_memory_type;
   *actual_memory_type_id = preferred_memory_type_id;

@@ -90,30 +94,20 @@ ResponseAlloc(
     case TRITONSERVER_MEMORY_CPU_PINNED: {
       *actual_memory_type = TRITONSERVER_MEMORY_CPU;
       *actual_memory_type_id = 0;
-      bi::managed_external_buffer::handle_t tensor_handle;
       try {
-        AllocatedSharedMemory<char> memory =
-            shm_pool->Construct<char>(byte_size);
-        *buffer = memory.data_.get();
-        tensor_handle = memory.handle_;
-
-        // Release the ownership to avoid deallocation. The buffer
-        // will be deallocated in ResponseRelease function.
-        memory.data_.release();
+        std::unique_ptr<PbMemory> pb_memory = PbMemory::Create(
+            shm_pool, *actual_memory_type, *actual_memory_type_id, byte_size,
+            nullptr /* data */, false /* copy_gpu */);
+        *buffer = pb_memory->DataPtr();
+        *buffer_userp = reinterpret_cast<void*>(pb_memory.get());
+        pb_memory.release();
       }
       catch (const PythonBackendException& pb_exception) {
         TRITONSERVER_Error* err =
             CreateTritonErrorFromException(pb_exception);
         return err;
       }
-      // Store the buffer offset in the userp; The userp is large enough to
-      // hold the shared memory offset and the address of the Shared memory
-      // manager
-      AllocationInfo* allocation_info = new AllocationInfo;
-      *buffer_userp = allocation_info;
-
-      allocation_info->handle_ = tensor_handle;
-      allocation_info->shm_manager_ = shm_pool;
+
     } break;
 #ifdef TRITON_ENABLE_GPU
     case TRITONSERVER_MEMORY_GPU: {
@@ -151,28 +145,12 @@ ResponseRelease(
     size_t byte_size, TRITONSERVER_MemoryType memory_type,
     int64_t memory_type_id)
 {
-  switch (memory_type) {
-    case TRITONSERVER_MEMORY_CPU:
-    case TRITONSERVER_MEMORY_CPU_PINNED: {
-      AllocationInfo* allocation_info =
-          reinterpret_cast<AllocationInfo*>(buffer_userp);
-      {
-        // Load the data so that it is deallocated automatically.
-        auto result = allocation_info->shm_manager_->Load<char>(
-            allocation_info->handle_, true /* unsafe */);
-      }
-
-      delete allocation_info;
-    } break;
-    case TRITONSERVER_MEMORY_GPU: {
-      // No action is required for the GPU tensors.
-    } break;
-  }
-
   return nullptr;  // Success
 }

-RequestExecutor::RequestExecutor(TRITONSERVER_Server* server) : server_(server)
+RequestExecutor::RequestExecutor(
+    std::unique_ptr<SharedMemoryManager>& shm_pool, TRITONSERVER_Server* server)
+    : server_(server), shm_pool_(shm_pool)
 {
   TRITONSERVER_ResponseAllocator* allocator;
   THROW_IF_TRITON_ERROR(TRITONSERVER_ResponseAllocatorNew(
@@ -183,7 +161,6 @@ RequestExecutor::RequestExecutor(TRITONSERVER_Server* server) : server_(server)
 std::unique_ptr<InferResponse>
 RequestExecutor::Infer(
     const std::shared_ptr<InferRequest>& infer_request,
-    const std::unique_ptr<SharedMemoryManager>& shm_pool,
     TRITONSERVER_InferenceResponse** triton_response)
 {
   std::unique_ptr<InferResponse> infer_response;
@@ -247,7 +224,7 @@ RequestExecutor::Infer(
   std::future<TRITONSERVER_InferenceResponse*> completed = p->get_future();

   THROW_IF_TRITON_ERROR(TRITONSERVER_InferenceRequestSetResponseCallback(
-      irequest, response_allocator_, shm_pool.get(), InferResponseComplete,
+      irequest, response_allocator_, shm_pool_.get(), InferResponseComplete,
       reinterpret_cast<void*>(p)));

   THROW_IF_TRITON_ERROR(TRITONSERVER_ServerInferAsync(
@@ -284,16 +261,21 @@ RequestExecutor::Infer(
       // userp is only set for the CPU tensors
       if (memory_type != TRITONSERVER_MEMORY_GPU) {
         if (byte_size != 0) {
-          output_tensors.push_back(std::make_shared<PbTensor>(
+          std::shared_ptr<PbTensor> pb_tensor = std::make_shared<PbTensor>(
               sname, dims_vector, datatype, memory_type, memory_type_id,
               const_cast<void*>(base), byte_size,
-              nullptr /* DLManagedTensor */,
-              *(reinterpret_cast<off_t*>(userp))));
+              nullptr /* DLManagedTensor */);
+
+          // Load the data so that it is deallocated automatically.
+          std::unique_ptr<PbMemory> pb_memory(
+              reinterpret_cast<PbMemory*>(userp));
+          pb_tensor->SetMemory(std::move(pb_memory));
+          output_tensors.push_back(pb_tensor);
         } else {
           output_tensors.push_back(std::make_shared<PbTensor>(
               sname, dims_vector, datatype, memory_type, memory_type_id,
               const_cast<void*>(base), byte_size,
-              nullptr /* DLManagedTensor */, 0 /* shared memory offest */));
+              nullptr /* DLManagedTensor */));
         }
       } else {
         output_tensors.push_back(std::make_shared<PbTensor>(

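ResponseAlloc now hands a PbMemory block to Triton and passes its raw pointer through buffer_userp, so Infer() can wrap it back into a std::unique_ptr and attach it to the output PbTensor, leaving ResponseRelease with nothing to free. The pool itself is only borrowed inside the callback: it is wrapped in a std::unique_ptr so it can be passed to PbMemory::Create, and ScopedDefer releases it before the callback returns. A minimal sketch of that borrow-and-release pattern follows, with Defer and Pool as stand-ins for ScopedDefer and SharedMemoryManager.

#include <functional>
#include <memory>

// Stand-in for the backend's ScopedDefer: runs a callable on scope exit.
class Defer {
 public:
  explicit Defer(std::function<void()> fn) : fn_(std::move(fn)) {}
  ~Defer() { fn_(); }

 private:
  std::function<void()> fn_;
};

struct Pool {};  // stand-in for SharedMemoryManager

// An API that insists on a std::unique_ptr<Pool>& (like PbMemory::Create).
void AllocateFrom(std::unique_ptr<Pool>& pool) { (void)pool; }

// Callback that borrows the pool through a raw userp pointer without ever
// owning it: the unique_ptr is released again before the scope ends.
void AllocCallback(void* userp)
{
  std::unique_ptr<Pool> pool(reinterpret_cast<Pool*>(userp));
  Defer release([&pool] { pool.release(); });
  AllocateFrom(pool);
}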
src/request_executor.h

Lines changed: 5 additions & 8 deletions
@@ -32,22 +32,19 @@ namespace triton { namespace backend { namespace python {
 TRITONSERVER_Error* CreateTritonErrorFromException(
     const PythonBackendException& pb_exception);

-
-struct AllocationInfo {
-  bi::managed_external_buffer::handle_t handle_;
-  SharedMemoryManager* shm_manager_;
-};
-
 class RequestExecutor {
   TRITONSERVER_ResponseAllocator* response_allocator_ = nullptr;
   TRITONSERVER_Server* server_;
+  std::unique_ptr<SharedMemoryManager>& shm_pool_;

  public:
   std::unique_ptr<InferResponse> Infer(
       const std::shared_ptr<InferRequest>& infer_request,
-      const std::unique_ptr<SharedMemoryManager>& shm_pool,
       TRITONSERVER_InferenceResponse** response);
-  RequestExecutor(TRITONSERVER_Server* server);
+  RequestExecutor(
+      std::unique_ptr<SharedMemoryManager>& shm_pool,
+      TRITONSERVER_Server* server);
+
   ~RequestExecutor();
 };
