
Commit 0a54e59

Fix error handling for GPU tensors (triton-inference-server#249)
* Fix error handling for GPU tensors
* Fix GPU buffer handling
* Review edit
* Fix for dynamically batched responses with GPU tensor
* Review edits
* Fix unused i variable for GPU=OFF
* Review comments
* Review edit
1 parent 637c7e3 commit 0a54e59

File tree: 10 files changed, +280 −172 lines

CMakeLists.txt

Lines changed: 2 additions & 0 deletions

@@ -163,6 +163,8 @@ set(
     src/metric.cc
     src/metric_family.h
     src/metric_family.cc
+    src/gpu_buffers.cc
+    src/gpu_buffers.h
 )
 
 set(

src/gpu_buffers.cc

Lines changed: 88 additions & 0 deletions (new file)

@@ -0,0 +1,88 @@
// Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
//   * Redistributions of source code must retain the above copyright
//     notice, this list of conditions and the following disclaimer.
//   * Redistributions in binary form must reproduce the above copyright
//     notice, this list of conditions and the following disclaimer in the
//     documentation and/or other materials provided with the distribution.
//   * Neither the name of NVIDIA CORPORATION nor the names of its
//     contributors may be used to endorse or promote products derived
//     from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

#include "gpu_buffers.h"
#include "pb_string.h"

namespace triton { namespace backend { namespace python {

GPUBuffersHelper::GPUBuffersHelper()
{
  completed_ = false;
}

void
GPUBuffersHelper::AddBuffer(const bi::managed_external_buffer::handle_t& handle)
{
  if (completed_) {
    throw PythonBackendException(
        "It is not possible to add buffers after 'Complete' has been called on "
        "a GPUBuffersHelper.");
  }

  buffers_.emplace_back(handle);
}

void
GPUBuffersHelper::SetError(
    std::unique_ptr<SharedMemoryManager>& shm_pool, const std::string& error)
{
  error_shm_ = PbString::Create(shm_pool, error);
}

void
GPUBuffersHelper::Complete(std::unique_ptr<SharedMemoryManager>& shm_pool)
{
  if (completed_) {
    throw PythonBackendException(
        "Complete has already been called. Complete should only be called "
        "once.");
  }
  gpu_buffers_shm_ = shm_pool->Construct<GPUBuffersShm>();
  if (!error_shm_) {
    buffers_handle_shm_ =
        shm_pool->Construct<bi::managed_external_buffer::handle_t>(
            buffers_.size());
    gpu_buffers_shm_.data_->buffer_count = buffers_.size();
    gpu_buffers_shm_.data_->success = true;
    gpu_buffers_shm_.data_->buffers = buffers_handle_shm_.handle_;
    for (size_t i = 0; i < buffers_.size(); ++i) {
      buffers_handle_shm_.data_.get()[i] = buffers_[i];
    }
  } else {
    gpu_buffers_shm_.data_->success = false;
    gpu_buffers_shm_.data_->error = error_shm_->ShmHandle();
  }
  completed_ = true;
}

bi::managed_external_buffer::handle_t
GPUBuffersHelper::ShmHandle()
{
  return gpu_buffers_shm_.handle_;
}

}}}  // namespace triton::backend::python
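
The helper above is written for a single produce-once lifecycle: accumulate buffer handles (or record an error), call Complete exactly once, then publish the resulting shared-memory handle. A minimal writer-side sketch of that intended usage; shm_pool, tensor_handles, and request_batch_shm_ptr are stand-ins for the surrounding backend state and are not defined in this commit's hunks:

// Writer-side usage sketch (hypothetical caller; names are assumptions).
GPUBuffersHelper gpu_buffer_helper;
try {
  for (auto& handle : tensor_handles) {
    // Throws if Complete() has already been called on this helper.
    gpu_buffer_helper.AddBuffer(handle);
  }
}
catch (const PythonBackendException& pb_exception) {
  // Record the failure; Complete() will then write success=false and the
  // error string instead of the handle array.
  gpu_buffer_helper.SetError(shm_pool, pb_exception.what());
}

// Serialize either the handle array or the error into shared memory.
gpu_buffer_helper.Complete(shm_pool);

// Publish a single handle that the peer process can Load() and inspect.
request_batch_shm_ptr->gpu_buffers_handle = gpu_buffer_helper.ShmHandle();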

src/gpu_buffers.h

Lines changed: 67 additions & 0 deletions (new file)

@@ -0,0 +1,67 @@
// Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
//   * Redistributions of source code must retain the above copyright
//     notice, this list of conditions and the following disclaimer.
//   * Redistributions in binary form must reproduce the above copyright
//     notice, this list of conditions and the following disclaimer in the
//     documentation and/or other materials provided with the distribution.
//   * Neither the name of NVIDIA CORPORATION nor the names of its
//     contributors may be used to endorse or promote products derived
//     from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

#pragma once

#include "pb_string.h"
#include "pb_utils.h"
#include "scoped_defer.h"

namespace triton { namespace backend { namespace python {

/// \param success indicates whether the process of fetching the GPU buffers
/// was successful.
/// \param error if success is false, the error object will be set.
/// \param buffers list of buffer handles.
/// \param buffer_count the number of buffers.
struct GPUBuffersShm {
  bool success;
  bi::managed_external_buffer::handle_t error;
  bi::managed_external_buffer::handle_t buffers;
  uint32_t buffer_count;
};

/// Helper class to facilitate transfer of metadata associated with the GPU
/// buffers in shared memory.
class GPUBuffersHelper {
 public:
  GPUBuffersHelper();
  void AddBuffer(const bi::managed_external_buffer::handle_t& handle);
  void Complete(std::unique_ptr<SharedMemoryManager>& shm_pool);
  void SetError(
      std::unique_ptr<SharedMemoryManager>& shm_pool, const std::string& error);
  bi::managed_external_buffer::handle_t ShmHandle();

 private:
  AllocatedSharedMemory<GPUBuffersShm> gpu_buffers_shm_;
  std::vector<bi::managed_external_buffer::handle_t> buffers_;
  AllocatedSharedMemory<bi::managed_external_buffer::handle_t>
      buffers_handle_shm_;
  std::unique_ptr<PbString> error_shm_;
  bool completed_;
};

}}}  // namespace triton::backend::python
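
GPUBuffersShm acts as a tagged union keyed on success: when success is true, buffers and buffer_count describe an array of handles; when false, only error is meaningful. A reader-side sketch, mirroring the infer_request.cc change below; gpu_buffers_handle and shm_pool are assumed inputs, not part of this header:

// Reader-side sketch: 'gpu_buffers_handle' is the handle published by the
// writer via GPUBuffersHelper::ShmHandle() (assumed to be available here).
AllocatedSharedMemory<GPUBuffersShm> gpu_buffers_shm =
    shm_pool->Load<GPUBuffersShm>(gpu_buffers_handle);

if (!gpu_buffers_shm.data_->success) {
  // Error path: 'error' refers to a PbString holding the failure message.
  std::unique_ptr<PbString> error = PbString::LoadFromSharedMemory(
      shm_pool, gpu_buffers_shm.data_->error);
  throw PythonBackendException(error->String());
}

// Success path: 'buffers' refers to an array of 'buffer_count' handles.
AllocatedSharedMemory<bi::managed_external_buffer::handle_t> handles =
    shm_pool->Load<bi::managed_external_buffer::handle_t>(
        gpu_buffers_shm.data_->buffers);
for (uint32_t i = 0; i < gpu_buffers_shm.data_->buffer_count; ++i) {
  bi::managed_external_buffer::handle_t handle = handles.data_.get()[i];
  // ... map each handle back to its GPU tensor buffer ...
}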

src/infer_request.cc

Lines changed: 10 additions & 1 deletion

@@ -28,6 +28,7 @@
 
 #include <boost/interprocess/sync/scoped_lock.hpp>
 
+#include "gpu_buffers.h"
 #include "pb_utils.h"
 #include "scoped_defer.h"
 #ifdef TRITON_PB_STUB
@@ -481,11 +482,19 @@ InferRequest::Exec(const bool is_decoupled)
     // Additional round trip required for asking the stub process
     // to fill in the GPU tensor buffers
     if (has_gpu_tensor) {
+      AllocatedSharedMemory<GPUBuffersShm> gpu_buffers_shm =
+          shm_pool->Load<GPUBuffersShm>(
+              request_batch_shm_ptr->gpu_buffers_handle);
       AllocatedSharedMemory<bi::managed_external_buffer::handle_t>
           gpu_buffers_handle =
               shm_pool->Load<bi::managed_external_buffer::handle_t>(
-                  request_batch_shm_ptr->gpu_buffers_handle);
+                  gpu_buffers_shm.data_->buffers);
       try {
+        if (!gpu_buffers_shm.data_->success) {
+          std::unique_ptr<PbString> error = PbString::LoadFromSharedMemory(
+              shm_pool, gpu_buffers_shm.data_->error);
+          throw PythonBackendException(error->String());
+        }
 #ifdef TRITON_ENABLE_GPU
         size_t i = 0;
         for (auto& input_tensor : this->Inputs()) {

src/infer_response.cc

Lines changed: 20 additions & 34 deletions

@@ -201,64 +201,50 @@ InferResponse::IsLastResponse()
 }
 
 #ifndef TRITON_PB_STUB
-std::shared_ptr<TRITONSERVER_Error*>
+void
 InferResponse::Send(
-    TRITONBACKEND_ResponseFactory* response_factory, void* cuda_stream,
+    TRITONBACKEND_Response* response, void* cuda_stream,
     bool& requires_deferred_callback, const uint32_t flags,
     std::unique_ptr<SharedMemoryManager>& shm_pool,
+    GPUBuffersHelper& gpu_buffer_helper,
     std::vector<std::pair<std::unique_ptr<PbMemory>, void*>>& output_buffers,
-    const std::set<std::string>& requested_output_names,
-    TRITONBACKEND_Response* response)
+    const std::set<std::string>& requested_output_names)
 {
   std::shared_ptr<TRITONSERVER_Error*> response_error =
       WrapTritonErrorInSharedPtr(nullptr);
   std::unique_ptr<ScopedDefer> response_error_handling;
   requires_deferred_callback = false;
 
-  // Should only destruct the response factory whenever a response factory is
-  // being created.
-  bool destruct_response_factor = (response == nullptr);
-
-  if (response == nullptr) {
-    SET_ERROR_AND_RETURN(
-        response_error,
-        TRITONBACKEND_ResponseNewFromFactory(&response, response_factory));
-  }
-
   // This lambda expression will be called when this function exits, if the
   // inference response doesn't have any GPU tensors. Otherwise, it will be
   // called when the object is destructed or DeferredSendCallback is called.
-  response_error_handling = std::make_unique<ScopedDefer>(
-      [response, response_error, flags, response_factory,
-       destruct_response_factor] {
+  response_error_handling =
+      std::make_unique<ScopedDefer>([response, response_error, flags] {
        if (response != nullptr) {
          LOG_IF_ERROR(
              TRITONBACKEND_ResponseSend(response, flags, *response_error),
              "failed to send the response.");
-          if (flags == TRITONSERVER_RESPONSE_COMPLETE_FINAL &&
-              destruct_response_factor) {
-            std::unique_ptr<
-                TRITONBACKEND_ResponseFactory, backend::ResponseFactoryDeleter>
-                response_factory_ptr(
-                    reinterpret_cast<TRITONBACKEND_ResponseFactory*>(
-                        response_factory));
-          }
        }
      });
 
   // Moves the response sending callback so that it is not called until the stub
   // process fills in the GPU buffers.
-  ScopedDefer deferred_task(
-      [this, &requires_deferred_callback, &response_error_handling] {
-        if (requires_deferred_callback) {
-          deferred_send_callback_ = std::move(response_error_handling);
-        }
-      });
+  ScopedDefer deferred_task([this, &requires_deferred_callback,
+                             &response_error_handling, &gpu_buffer_helper,
+                             response_error, &shm_pool] {
+    if (*response_error != nullptr) {
+      gpu_buffer_helper.SetError(
+          shm_pool, TRITONSERVER_ErrorMessage(*response_error));
+    }
+    if (requires_deferred_callback) {
+      deferred_send_callback_ = std::move(response_error_handling);
+    }
+  });
 
   if (HasError()) {
     *response_error = TRITONSERVER_ErrorNew(
         TRITONSERVER_ERROR_INTERNAL, Error()->Message().c_str());
-    return nullptr;
+    return;
   }
 
   bool cuda_copy = false;
@@ -322,6 +308,7 @@ InferResponse::Send(
           output_tensor->ByteSize(), reinterpret_cast<char*>(buffer),
           true /* copy_gpu */));
     }
+    gpu_buffer_helper.AddBuffer(output_buffer->ShmHandle());
     output_buffers.push_back({std::move(output_buffer), buffer});
 #endif
   }
@@ -336,6 +323,7 @@ InferResponse::Send(
         shm_pool, actual_memory_type, actual_memory_type_id,
         output_tensor->ByteSize(), nullptr /* data ptr */));
 
+    gpu_buffer_helper.AddBuffer(output_buffer->ShmHandle());
     output_buffers.push_back({std::move(output_buffer), buffer});
   }
 
@@ -357,8 +345,6 @@ InferResponse::Send(
     cudaStreamSynchronize(reinterpret_cast<cudaStream_t>(cuda_stream));
   }
 #endif  // TRITON_ENABLE_GPU
-
-  return response_error;
 }
 #endif
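
With this change, Send no longer reports failures through a returned error pointer: any error raised while filling output buffers lands in response_error, and the deferred task copies its message into the GPUBuffersHelper so the stub process observes it through GPUBuffersShm. The actual call site lives in python_be.cc, whose hunks are not part of this excerpt, so the following caller-side sketch is illustrative only and all names in it are assumptions:

// Illustrative call site for the new Send() signature (names assumed).
GPUBuffersHelper gpu_buffer_helper;
std::vector<std::pair<std::unique_ptr<PbMemory>, void*>> gpu_output_buffers;
bool requires_deferred_callback = false;

infer_response->Send(
    response, cuda_stream, requires_deferred_callback,
    TRITONSERVER_RESPONSE_COMPLETE_FINAL, shm_pool, gpu_buffer_helper,
    gpu_output_buffers, requested_output_names);

// Complete() serializes the outcome for the stub process: the collected
// buffer handles on success, or the error recorded via SetError() on failure.
gpu_buffer_helper.Complete(shm_pool);

if (requires_deferred_callback) {
  // GPU tensors are present: TRITONBACKEND_ResponseSend is postponed until
  // the stub has filled the GPU buffers, and is then triggered via
  // infer_response->DeferredSendCallback().
}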

src/infer_response.h

Lines changed: 7 additions & 6 deletions

@@ -27,6 +27,7 @@
 #pragma once
 
 #include <future>
+#include "gpu_buffers.h"
 #include "pb_error.h"
 #include "pb_tensor.h"
 #include "pb_utils.h"
@@ -49,7 +50,7 @@ struct ResponseShm {
     TRITONSERVER_Error* raasnie_err__ = (X); \
     if (raasnie_err__ != nullptr) { \
       *E = raasnie_err__; \
-      return E; \
+      return; \
     } \
   } while (false)
 
@@ -62,7 +63,7 @@ struct ResponseShm {
     TRITONSERVER_Error* rarie_err__ = TRITONSERVER_ErrorNew( \
         TRITONSERVER_ERROR_INTERNAL, pb_exception.what()); \
     *E = rarie_err__; \
-    return E; \
+    return; \
   } \
   } while (false)
 
@@ -96,13 +97,13 @@ class InferResponse {
   /// response needs to be done in two steps. The boolean
   /// 'requires_deferred_callback' indicates whether DeferredSendCallback method
   /// should be called or not.
-  std::shared_ptr<TRITONSERVER_Error*> Send(
-      TRITONBACKEND_ResponseFactory* response_factory, void* cuda_stream,
+  void Send(
+      TRITONBACKEND_Response* response, void* cuda_stream,
       bool& requires_deferred_callback, const uint32_t flags,
       std::unique_ptr<SharedMemoryManager>& shm_pool,
+      GPUBuffersHelper& gpu_buffer_helper,
       std::vector<std::pair<std::unique_ptr<PbMemory>, void*>>& output_buffers,
-      const std::set<std::string>& requested_output_names = {},
-      TRITONBACKEND_Response* response = nullptr);
+      const std::set<std::string>& requested_output_names = {});
 
   void DeferredSendCallback();
 #endif
