Commit b7cec48

Fix BLS performance (triton-inference-server#135)
1 parent 41d6460 commit b7cec48

5 files changed: +70 -79 lines changed

src/pb_tensor.cc

Lines changed: 32 additions & 16 deletions
@@ -122,16 +122,14 @@ PbTensor::PbTensor(
     const std::string& name, const std::vector<int64_t>& dims,
     TRITONSERVER_DataType dtype, TRITONSERVER_MemoryType memory_type,
     int64_t memory_type_id, void* memory_ptr, uint64_t byte_size,
-    DLManagedTensor* dl_managed_tensor,
-    bi::managed_external_buffer::handle_t shm_handle)
+    DLManagedTensor* dl_managed_tensor)
 {
   name_ = name;
   memory_ptr_ = memory_ptr;
   memory_type_ = memory_type;
   memory_type_id_ = memory_type_id;
   dtype_ = dtype;
   dims_ = dims;
-  // [FIXME] fix shm_handle

 #ifdef TRITON_PB_STUB
   if (memory_type_ == TRITONSERVER_MEMORY_CPU ||
@@ -400,9 +398,16 @@ PbTensor::SaveToSharedMemory(
     std::unique_ptr<SharedMemoryManager>& shm_pool, bool copy_gpu)
 {
   if (!tensor_shm_.data_) {
-    uint64_t byte_size = sizeof(TensorShm) + sizeof(int64_t) * dims_.size() +
-                         PbString::ShmStructSize(name_) +
-                         PbMemory::ShmStructSize(memory_type_, byte_size_);
+    uint64_t byte_size;
+    if (!pb_memory_) {
+      byte_size = sizeof(TensorShm) + sizeof(int64_t) * dims_.size() +
+                  PbString::ShmStructSize(name_) +
+                  PbMemory::ShmStructSize(memory_type_, byte_size_);
+
+    } else {
+      byte_size = sizeof(TensorShm) + sizeof(int64_t) * dims_.size() +
+                  PbString::ShmStructSize(name_);
+    }
     tensor_shm_ = shm_pool->Construct<char>(byte_size);

     tensor_shm_ptr_ = reinterpret_cast<TensorShm*>(tensor_shm_.data_.get());
@@ -425,13 +430,17 @@ PbTensor::SaveToSharedMemory(
         shm_handle_ + name_offset);
     std::size_t pb_memory_offset = name_offset + PbString::ShmStructSize(name_);

-    pb_memory_ = PbMemory::Create(
-        memory_type_, memory_type_id_, byte_size_,
-        reinterpret_cast<char*>(memory_ptr_),
-        reinterpret_cast<char*>(tensor_shm_ptr_) + pb_memory_offset,
-        shm_handle_ + pb_memory_offset, copy_gpu);
+    if (!pb_memory_) {
+      pb_memory_ = PbMemory::Create(
+          memory_type_, memory_type_id_, byte_size_,
+          reinterpret_cast<char*>(memory_ptr_),
+          reinterpret_cast<char*>(tensor_shm_ptr_) + pb_memory_offset,
+          shm_handle_ + pb_memory_offset, copy_gpu);
+      tensor_shm_ptr_->memory = 0;
+    } else {
+      tensor_shm_ptr_->memory = pb_memory_->ShmHandle();
+    }

-    tensor_shm_ptr_->memory = pb_memory_->ShmHandle();
     memory_ptr_ = pb_memory_->DataPtr();
   }
 }
@@ -449,10 +458,17 @@ PbTensor::LoadFromSharedMemory(
   std::unique_ptr<PbString> name_shm = PbString::LoadFromSharedMemory(
       tensor_handle + name_offset, tensor_shm.data_.get() + name_offset);

-  std::size_t pb_memory_offset = name_offset + name_shm->Size();
-  std::unique_ptr<PbMemory> pb_memory = PbMemory::LoadFromSharedMemory(
-      pb_memory_offset, tensor_shm.data_.get() + pb_memory_offset,
-      open_cuda_handle);
+  std::unique_ptr<PbMemory> pb_memory;
+  if (tensor_shm_ptr->memory == 0) {
+    std::size_t pb_memory_offset = name_offset + name_shm->Size();
+    pb_memory = PbMemory::LoadFromSharedMemory(
+        pb_memory_offset, tensor_shm.data_.get() + pb_memory_offset,
+        open_cuda_handle);
+  } else {
+    pb_memory = PbMemory::LoadFromSharedMemory(
+        shm_pool, tensor_shm_ptr->memory, open_cuda_handle);
+  }
+
   return std::unique_ptr<PbTensor>(
       new PbTensor(tensor_shm, name_shm, pb_memory));
 }

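The hunks above let a PbTensor reuse a PbMemory block it already owns: SaveToSharedMemory records that block's existing handle instead of copying the payload into a fresh allocation, and LoadFromSharedMemory treats a handle of 0 as "the payload is inlined after the header". A minimal sketch of that convention follows; the struct and helper are illustrative stand-ins, not the backend's real types.

#include <cstdint>

// Illustrative stand-in for the tensor header stored in shared memory.
struct TensorHeader {
  std::uint64_t memory;  // 0 => payload is inlined after the header,
                         // otherwise the handle of an existing PbMemory block
};

// Mirrors the branch in LoadFromSharedMemory(): fall back to the inline
// offset only when no separate PbMemory handle was recorded.
std::uint64_t PayloadHandle(
    const TensorHeader& header, std::uint64_t inline_offset)
{
  return (header.memory == 0) ? inline_offset : header.memory;
}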
src/pb_tensor.h

Lines changed: 1 addition & 2 deletions
@@ -96,8 +96,7 @@ class PbTensor {
       const std::string& name, const std::vector<int64_t>& dims,
       TRITONSERVER_DataType dtype, TRITONSERVER_MemoryType memory_type,
       int64_t memory_type_id, void* memory_ptr, uint64_t byte_size,
-      DLManagedTensor* dl_managed_tensor = nullptr,
-      bi::managed_external_buffer::handle_t shm_handle = 0);
+      DLManagedTensor* dl_managed_tensor = nullptr);

   /// This constructor is used when
   /// loading the tensor from shared memory.

src/python.cc

Lines changed: 7 additions & 10 deletions
@@ -307,9 +307,7 @@ class ModelInstanceState : public BackendModelInstance {
       bool& restart);

   // Execute a BLS Request
-  void ExecuteBLSRequest(
-      std::unique_ptr<SharedMemoryManager>& shm_pool,
-      bi::managed_external_buffer::handle_t message_offset);
+  void ExecuteBLSRequest(bi::managed_external_buffer::handle_t message_offset);

   // Cleanup BLS responses
   void CleanupBLSResponses();
@@ -959,14 +957,13 @@ ModelInstanceState::GetInputTensor(

 void
 ModelInstanceState::ExecuteBLSRequest(
-    std::unique_ptr<SharedMemoryManager>& shm_pool,
     bi::managed_external_buffer::handle_t message_offset)
 {
   ModelState* model_state = reinterpret_cast<ModelState*>(Model());
   auto request_executor =
-      std::make_unique<RequestExecutor>(model_state->TritonServer());
+      std::make_unique<RequestExecutor>(shm_pool_, model_state->TritonServer());
   std::unique_ptr<IPCMessage> ipc_message =
-      IPCMessage::LoadFromSharedMemory(shm_pool, message_offset);
+      IPCMessage::LoadFromSharedMemory(shm_pool_, message_offset);
   bool is_response_batch_set = false;
   std::unique_ptr<InferResponse> infer_response;
   ResponseBatch* response_batch;
@@ -1078,8 +1075,8 @@ ModelInstanceState::ExecuteBLSRequest(
       }

       if (pb_exception.what() != nullptr) {
-        infer_response = request_executor->Infer(
-            infer_request, shm_pool_, &inference_response);
+        infer_response =
+            request_executor->Infer(infer_request, &inference_response);

         if (infer_response) {
           infer_response->SaveToSharedMemory(shm_pool_);
@@ -1110,7 +1107,7 @@ ModelInstanceState::ExecuteBLSRequest(
     if (is_response_batch_set) {
       response_batch->has_error = true;
       LOG_IF_EXCEPTION(
-          pb_error_message = PbString::Create(shm_pool, pb_exception.what()));
+          pb_error_message = PbString::Create(shm_pool_, pb_exception.what()));

       if (pb_error_message != nullptr) {
         response_batch->is_error_set = true;
@@ -1370,7 +1367,7 @@ ModelInstanceState::ProcessRequests(
       // Launch the BLS request in a future.
       bls_futures_.emplace_back(
           std::async(std::launch::async, [this, current_message]() {
-            this->ExecuteBLSRequest(this->shm_pool_, current_message);
+            this->ExecuteBLSRequest(current_message);
           }));

       auto error = ReceiveMessageFromStub(response_message);

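With shm_pool_ reachable as a member of both ModelInstanceState and RequestExecutor, the BLS path no longer threads the shared-memory pool through every call; only the message handle travels with the request. A small sketch of the resulting launch pattern, using placeholder types (Pool, Instance) rather than the backend's real classes:

#include <cstdint>
#include <future>
#include <memory>
#include <vector>

struct Pool {};  // placeholder for SharedMemoryManager

class Instance {
 public:
  // Queue the BLS request on a separate thread; the pool is reached through
  // the member rather than being passed as an argument.
  void Launch(std::uint64_t message_handle)
  {
    bls_futures_.emplace_back(
        std::async(std::launch::async, [this, message_handle]() {
          this->ExecuteBLSRequest(message_handle);
        }));
  }

 private:
  void ExecuteBLSRequest(std::uint64_t message_handle)
  {
    // Would load the IPC message from shm_pool_ and run the request.
    (void)message_handle;
  }

  std::unique_ptr<Pool> shm_pool_ = std::make_unique<Pool>();
  std::vector<std::future<void>> bls_futures_;
};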
src/request_executor.cc

Lines changed: 25 additions & 43 deletions
@@ -28,6 +28,7 @@

 #include <future>
 #include "pb_utils.h"
+#include "scoped_defer.h"
 #include "triton/backend/backend_common.h"
 #include "triton/core/tritonserver.h"

@@ -72,7 +73,10 @@ ResponseAlloc(
     void** buffer_userp, TRITONSERVER_MemoryType* actual_memory_type,
     int64_t* actual_memory_type_id)
 {
-  SharedMemoryManager* shm_pool = reinterpret_cast<SharedMemoryManager*>(userp);
+  std::unique_ptr<SharedMemoryManager> shm_pool(
+      reinterpret_cast<SharedMemoryManager*>(userp));
+
+  ScopedDefer _([&shm_pool] { shm_pool.release(); });
   *actual_memory_type = preferred_memory_type;
   *actual_memory_type_id = preferred_memory_type_id;

@@ -90,30 +94,20 @@ ResponseAlloc(
     case TRITONSERVER_MEMORY_CPU_PINNED: {
       *actual_memory_type = TRITONSERVER_MEMORY_CPU;
       *actual_memory_type_id = 0;
-      bi::managed_external_buffer::handle_t tensor_handle;
       try {
-        AllocatedSharedMemory<char> memory =
-            shm_pool->Construct<char>(byte_size);
-        *buffer = memory.data_.get();
-        tensor_handle = memory.handle_;
-
-        // Release the ownership to avoid deallocation. The buffer
-        // will be deallocated in ResponseRelease function.
-        memory.data_.release();
+        std::unique_ptr<PbMemory> pb_memory = PbMemory::Create(
+            shm_pool, *actual_memory_type, *actual_memory_type_id, byte_size,
+            nullptr /* data */, false /* copy_gpu */);
+        *buffer = pb_memory->DataPtr();
+        *buffer_userp = reinterpret_cast<void*>(pb_memory.get());
+        pb_memory.release();
       }
       catch (const PythonBackendException& pb_exception) {
         TRITONSERVER_Error* err =
             CreateTritonErrorFromException(pb_exception);
         return err;
       }
-      // Store the buffer offset in the userp; The userp is large enough to
-      // hold the shared memory offset and the address of the Shared memory
-      // manager
-      AllocationInfo* allocation_info = new AllocationInfo;
-      *buffer_userp = allocation_info;
-
-      allocation_info->handle_ = tensor_handle;
-      allocation_info->shm_manager_ = shm_pool;
+
     } break;
 #ifdef TRITON_ENABLE_GPU
     case TRITONSERVER_MEMORY_GPU: {
@@ -151,28 +145,12 @@ ResponseRelease(
     size_t byte_size, TRITONSERVER_MemoryType memory_type,
     int64_t memory_type_id)
 {
-  switch (memory_type) {
-    case TRITONSERVER_MEMORY_CPU:
-    case TRITONSERVER_MEMORY_CPU_PINNED: {
-      AllocationInfo* allocation_info =
-          reinterpret_cast<AllocationInfo*>(buffer_userp);
-      {
-        // Load the data so that it is deallocated automatically.
-        auto result = allocation_info->shm_manager_->Load<char>(
-            allocation_info->handle_, true /* unsafe */);
-      }
-
-      delete allocation_info;
-    } break;
-    case TRITONSERVER_MEMORY_GPU: {
-      // No action is required for the GPU tensors.
-    } break;
-  }
-
   return nullptr;  // Success
 }

-RequestExecutor::RequestExecutor(TRITONSERVER_Server* server) : server_(server)
+RequestExecutor::RequestExecutor(
+    std::unique_ptr<SharedMemoryManager>& shm_pool, TRITONSERVER_Server* server)
+    : server_(server), shm_pool_(shm_pool)
 {
   TRITONSERVER_ResponseAllocator* allocator;
   THROW_IF_TRITON_ERROR(TRITONSERVER_ResponseAllocatorNew(
@@ -183,7 +161,6 @@ RequestExecutor::RequestExecutor(TRITONSERVER_Server* server) : server_(server)
 std::unique_ptr<InferResponse>
 RequestExecutor::Infer(
     const std::shared_ptr<InferRequest>& infer_request,
-    const std::unique_ptr<SharedMemoryManager>& shm_pool,
     TRITONSERVER_InferenceResponse** triton_response)
 {
   std::unique_ptr<InferResponse> infer_response;
@@ -247,7 +224,7 @@ RequestExecutor::Infer(
   std::future<TRITONSERVER_InferenceResponse*> completed = p->get_future();

   THROW_IF_TRITON_ERROR(TRITONSERVER_InferenceRequestSetResponseCallback(
-      irequest, response_allocator_, shm_pool.get(), InferResponseComplete,
+      irequest, response_allocator_, shm_pool_.get(), InferResponseComplete,
       reinterpret_cast<void*>(p)));

   THROW_IF_TRITON_ERROR(TRITONSERVER_ServerInferAsync(
@@ -284,16 +261,21 @@ RequestExecutor::Infer(
       // userp is only set for the CPU tensors
       if (memory_type != TRITONSERVER_MEMORY_GPU) {
         if (byte_size != 0) {
-          output_tensors.push_back(std::make_shared<PbTensor>(
+          std::shared_ptr<PbTensor> pb_tensor = std::make_shared<PbTensor>(
               sname, dims_vector, datatype, memory_type, memory_type_id,
               const_cast<void*>(base), byte_size,
-              nullptr /* DLManagedTensor */,
-              *(reinterpret_cast<off_t*>(userp))));
+              nullptr /* DLManagedTensor */);
+
+          // Load the data so that it is deallocated automatically.
+          std::unique_ptr<PbMemory> pb_memory(
+              reinterpret_cast<PbMemory*>(userp));
+          pb_tensor->SetMemory(std::move(pb_memory));
+          output_tensors.push_back(pb_tensor);
         } else {
           output_tensors.push_back(std::make_shared<PbTensor>(
               sname, dims_vector, datatype, memory_type, memory_type_id,
               const_cast<void*>(base), byte_size,
-              nullptr /* DLManagedTensor */, 0 /* shared memory offest */));
+              nullptr /* DLManagedTensor */));
         }
       } else {
         output_tensors.push_back(std::make_shared<PbTensor>(

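ResponseAlloc now hands a PbMemory block to Triton and passes its raw pointer through buffer_userp, so Infer() can wrap it back into a std::unique_ptr and attach it to the output PbTensor, leaving ResponseRelease with nothing to free. The pool itself is only borrowed inside the callback: it is wrapped in a std::unique_ptr so it can be passed to PbMemory::Create, and ScopedDefer releases it before the callback returns. A minimal sketch of that borrow-and-release pattern follows, with Defer and Pool as stand-ins for ScopedDefer and SharedMemoryManager.

#include <functional>
#include <memory>

// Stand-in for the backend's ScopedDefer: runs a callable on scope exit.
class Defer {
 public:
  explicit Defer(std::function<void()> fn) : fn_(std::move(fn)) {}
  ~Defer() { fn_(); }

 private:
  std::function<void()> fn_;
};

struct Pool {};  // stand-in for SharedMemoryManager

// An API that insists on a std::unique_ptr<Pool>& (like PbMemory::Create).
void AllocateFrom(std::unique_ptr<Pool>& pool) { (void)pool; }

// Callback that borrows the pool through a raw userp pointer without ever
// owning it: the unique_ptr is released again before the scope ends.
void AllocCallback(void* userp)
{
  std::unique_ptr<Pool> pool(reinterpret_cast<Pool*>(userp));
  Defer release([&pool] { pool.release(); });
  AllocateFrom(pool);
}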
src/request_executor.h

Lines changed: 5 additions & 8 deletions
@@ -32,22 +32,19 @@ namespace triton { namespace backend { namespace python {
 TRITONSERVER_Error* CreateTritonErrorFromException(
     const PythonBackendException& pb_exception);

-
-struct AllocationInfo {
-  bi::managed_external_buffer::handle_t handle_;
-  SharedMemoryManager* shm_manager_;
-};
-
 class RequestExecutor {
   TRITONSERVER_ResponseAllocator* response_allocator_ = nullptr;
   TRITONSERVER_Server* server_;
+  std::unique_ptr<SharedMemoryManager>& shm_pool_;

  public:
   std::unique_ptr<InferResponse> Infer(
       const std::shared_ptr<InferRequest>& infer_request,
-      const std::unique_ptr<SharedMemoryManager>& shm_pool,
       TRITONSERVER_InferenceResponse** response);
-  RequestExecutor(TRITONSERVER_Server* server);
+  RequestExecutor(
+      std::unique_ptr<SharedMemoryManager>& shm_pool,
+      TRITONSERVER_Server* server);
+
   ~RequestExecutor();
 };
