From a4f3138f1b2734e3fd82def3c539e7a911b81650 Mon Sep 17 00:00:00 2001 From: Hemant Jain Date: Tue, 26 Apr 2022 08:30:40 -0700 Subject: [PATCH 01/76] Fix libc10 missing issue on jetson (#56) (#58) --- CMakeLists.txt | 124 ++++++++++++++++++++++++++++++------------------- 1 file changed, 77 insertions(+), 47 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 076b095..909148f 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -125,60 +125,67 @@ endif() # TRITON_ENABLE_GPU # configure_file(src/libtriton_pytorch.ldscript libtriton_pytorch.ldscript COPYONLY) -if (${TRITON_PYTORCH_DOCKER_BUILD}) - if (CMAKE_HOST_SYSTEM_PROCESSOR MATCHES "aarch64") - set(LIBS_ARCH "aarch64") - set(CONDA_LIBS - "libopenblas.so.0" - ) - else() - set(LIBS_ARCH "x86_64") - set(CONDA_LIBS - "libmkl_core.so" - "libmkl_gnu_thread.so" - "libmkl_intel_lp64.so" - "libmkl_intel_thread.so" - "libmkl_def.so" - "libmkl_vml_def.so" - "libmkl_rt.so" - "libmkl_avx2.so" - "libmkl_avx512.so" - "libmkl_sequential.so" - "libomp.so" - ) - endif() +set(PT_LIBS + "libc10.so" + "libc10_cuda.so" + "libtorch.so" + "libtorch_cpu.so" + "libtorch_cuda.so" + "libtorch_global_deps.so" +) + +if (${TRITON_PYTORCH_ENABLE_TORCHVISION}) set(PT_LIBS - ${CONDA_LIBS} - "libc10.so" - "libc10_cuda.so" - "libtorch.so" - "libtorch_cpu.so" - "libtorch_cuda.so" - "libtorch_global_deps.so" + ${PT_LIBS} "libtorchvision.so" ) - set(OPENCV_LIBS - "libopencv_video.so" - "libopencv_videoio.so" - "libopencv_highgui.so" - "libopencv_imgcodecs.so" - "libopencv_imgproc.so" - "libopencv_core.so" - "libpng16.so" +endif() # TRITON_PYTORCH_ENABLE_TORCHVISION + +if (${TRITON_PYTORCH_ENABLE_TORCHTRT}) + set(PT_LIBS + ${PT_LIBS} + "libtorchtrt_runtime.so" + ) +endif() # TRITON_PYTORCH_ENABLE_TORCHTRT + +if (CMAKE_HOST_SYSTEM_PROCESSOR MATCHES "aarch64") + set(LIBS_ARCH "aarch64") + set(CONDA_LIBS + "libopenblas.so.0" + ) +else() + set(LIBS_ARCH "x86_64") + set(CONDA_LIBS + "libmkl_core.so" + "libmkl_gnu_thread.so" + "libmkl_intel_lp64.so" + "libmkl_intel_thread.so" + "libmkl_def.so" + "libmkl_vml_def.so" + "libmkl_rt.so" + "libmkl_avx2.so" + "libmkl_avx512.so" + "libmkl_sequential.so" + "libomp.so" ) +endif() +set(OPENCV_LIBS + "libopencv_video.so" + "libopencv_videoio.so" + "libopencv_highgui.so" + "libopencv_imgcodecs.so" + "libopencv_imgproc.so" + "libopencv_core.so" + "libpng16.so" +) +if (${TRITON_PYTORCH_DOCKER_BUILD}) string(REPLACE ";" " " CONDA_LIBS_STR "${CONDA_LIBS}") - if (${TRITON_PYTORCH_ENABLE_TORCHTRT}) - set(PT_LIBS - ${PT_LIBS} - "libtorchtrt_runtime.so" - ) - endif() # TRITON_PYTORCH_ENABLE_TORCHTRT - add_custom_command( OUTPUT ${PT_LIBS} + ${CONDA_LIBS} ${OPENCV_LIBS} LICENSE.pytorch include/torch @@ -218,7 +225,7 @@ if (${TRITON_PYTORCH_DOCKER_BUILD}) COMMENT "Extracting pytorch and torchvision libraries and includes from ${TRITON_PYTORCH_DOCKER_IMAGE}" VERBATIM ) - add_custom_target(ptlib_target DEPENDS ${PT_LIBS} ${OPENCV_LIBS}) + add_custom_target(ptlib_target DEPENDS ${PT_LIBS} ${CONDA_LIBS} ${OPENCV_LIBS}) add_library(ptlib SHARED IMPORTED GLOBAL) add_dependencies(ptlib ptlib_target) @@ -384,7 +391,7 @@ install( if (${TRITON_PYTORCH_DOCKER_BUILD}) set(PT_LIB_PATHS "") - FOREACH(plib ${PT_LIBS} ${OPENCV_LIBS}) + FOREACH(plib ${PT_LIBS} ${CONDA_LIBS} ${OPENCV_LIBS}) set(PT_LIB_PATHS ${PT_LIB_PATHS} "${CMAKE_CURRENT_BINARY_DIR}/${plib}") ENDFOREACH(plib) @@ -403,7 +410,7 @@ if (${TRITON_PYTORCH_DOCKER_BUILD}) ) endif() # TRITON_PYTORCH_ENABLE_TORCHTRT - FOREACH(plib ${PT_LIBS} ${OPENCV_LIBS}) + FOREACH(plib ${PT_LIBS} ${CONDA_LIBS} 
${OPENCV_LIBS}) install( CODE "EXECUTE_PROCESS( @@ -433,6 +440,29 @@ if (${TRITON_PYTORCH_DOCKER_BUILD}) message(FATAL_ERROR \"FAILED: to create links\") endif()" ) +else() + FOREACH(plib ${PT_LIBS}) + set(PT_LIB_PATHS ${PT_LIB_PATHS} "${TRITON_PYTORCH_LIB_PATHS}/${plib}") + ENDFOREACH(plib) + + install( + FILES + ${PT_LIB_PATHS} + DESTINATION ${CMAKE_INSTALL_PREFIX}/backends/pytorch + ) + + FOREACH(plib ${PT_LIBS}) + install( + CODE + "EXECUTE_PROCESS( + COMMAND patchelf --set-rpath \$ORIGIN ${plib} + RESULT_VARIABLE PATCHELF_STATUS + WORKING_DIRECTORY ${CMAKE_INSTALL_PREFIX}/backends/pytorch) + if(PATCHELF_STATUS AND NOT PATCHELF_STATUS EQUAL 0) + message(FATAL_ERROR \"FAILED: to run patchelf\") + endif()" + ) + ENDFOREACH(plib) endif() # TRITON_PYTORCH_DOCKER_BUILD install( From 510cc49df8012bee78839e976ff79a0909992cdc Mon Sep 17 00:00:00 2001 From: Hemant Jain Date: Wed, 4 May 2022 13:54:15 -0700 Subject: [PATCH 02/76] Fix typos + cleanup ReadMe (#62) --- README.md | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/README.md b/README.md index 0eb8388..96282ce 100644 --- a/README.md +++ b/README.md @@ -67,7 +67,7 @@ $ make install ``` The following required Triton repositories will be pulled and used in -the build. By default the "main" branch/tag will be used for each repo +the build. By default, the "main" branch/tag will be used for each repo but the listed CMake argument can be used to override. * triton-inference-server/backend: -DTRITON_BACKEND_REPO_TAG=[tag] @@ -100,10 +100,10 @@ $ make install ### Parameters Triton exposes some flags to control the execution mode of the TorchScript models through -the Parameters section of the model's 'config.pbtxt' file. +the Parameters section of the model's `config.pbtxt` file. * `DISABLE_OPTIMIZED_EXECUTION`: Boolean flag to disable the optimized execution -of TorchScript models. By default the optimized execuiton is always enabled. +of TorchScript models. By default, the optimized execution is always enabled. The initial calls to a loaded TorchScript model take extremely long. Due to this longer model warmup [issue](https://github.com/pytorch/pytorch/issues/57894), Triton also allows @@ -117,13 +117,13 @@ The section of model config file specifying this parameter will look like: parameters: { key: "DISABLE_OPTIMIZED_EXECUTION" value: { - string_value:"true" + string_value: "true" } } ``` * `INFERENCE_MODE`: Boolean flag to enable the Inference Mode execution -of TorchScript models. By default the inference mode is disabled. +of TorchScript models. By default, the inference mode is disabled. [InferenceMode](https://pytorch.org/cppdocs/notes/inference_mode.html) is a new RAII guard analogous to NoGradMode to be used when you are certain your operations @@ -139,14 +139,14 @@ The section of model config file specifying this parameter will look like: parameters: { key: "INFERENCE_MODE" value: { - string_value:"true" + string_value: "true" } } ``` * `ENABLE_NVFUSER`: Boolean flag to enable the NvFuser (CUDA Graph Fuser) optimization for TorchScript models. If not specified, the -default pytorch fuser is used. If `ENABLE_NVFUSER` is specified, the +default PyTorch fuser is used. If `ENABLE_NVFUSER` is specified, the `ENABLE_TENSOR_FUSER` configuration (see below) is ignored. 
Please note that in some models generated using trace in old PyTorch versions might not work @@ -159,7 +159,7 @@ The section of model config file specifying this parameter will look like: parameters: { key: "ENABLE_NVFUSER" value: { - string_value:"true" + string_value: "true" } } ``` @@ -174,7 +174,7 @@ The section of model config file specifying this parameter will look like: parameters: { key: "ENABLE_WEIGHT_SHARING" value: { - string_value:"true" + string_value: "true" } } ``` @@ -191,9 +191,9 @@ complex execution modes and dynamic shapes. If not specified, all are enabled by ### Important Note -* The execution of pytorch model on GPU is asynchronous in nature. See +* The execution of PyTorch model on GPU is asynchronous in nature. See [here](https://pytorch.org/docs/stable/notes/cuda.html#asynchronous-execution) - for more details. Consequently, an error in pytorch model execution may + for more details. Consequently, an error in PyTorch model execution may be raised during the next few inference requests to the server. Setting environment variable `CUDA_LAUNCH_BLOCKING=1` when launching server will help in correctly debugging failing cases by forcing synchronous execution. @@ -201,8 +201,8 @@ complex execution modes and dynamic shapes. If not specified, all are enabled by state and a restart of the server may be required to continue serving successfully. -* Multiple instances of the pytorch model on GPU do not always - increase performance. Due to thread specific caching in pytorch, using +* Multiple instances of the PyTorch model on GPU do not always + increase performance. Due to thread specific caching in PyTorch, using multiple instances of the model interact negatively. See [here](https://github.com/pytorch/pytorch/issues/27902) for more details. Setting the parameter `DISABLE_OPTIMIZED_EXECUTION` to "true" in the model From ab59a37ebecc3d921b9d7bc5a00fecf4defcc529 Mon Sep 17 00:00:00 2001 From: Hemant Jain Date: Mon, 9 May 2022 09:56:26 -0700 Subject: [PATCH 03/76] Add support for String input/output (#60) * x * Fix string input tensor support * Fix string list creation * Add support for String output * cleanup * review edits * Readme fixes * review edits - throw error during model loading * follow up edits * Fix typo in check * Use verbose checking of output type to ensure appropriate error message is returned during model execution * cleanup * Wrap raw pointer to allow auto freeing of memory * follow up review edits --- README.md | 8 + src/libtorch.cc | 531 +++++++++++++++++++++++++++++++++++++++++------- 2 files changed, 471 insertions(+), 68 deletions(-) diff --git a/README.md b/README.md index 96282ce..4b41c1a 100644 --- a/README.md +++ b/README.md @@ -208,3 +208,11 @@ complex execution modes and dynamic shapes. If not specified, all are enabled by Setting the parameter `DISABLE_OPTIMIZED_EXECUTION` to "true" in the model configuration may help in some cases to avoid these negative interactions due to model specific caching and increase multiple instance performance. + +* PyTorch does not support Tensor of Strings but it does support models that accept +a List of Strings as input(s) / produces a List of String as output(s). For these models +Triton allows users to pass String input(s)/recieve String output(s) using the String +datatype. As a limitation of using List instead of Tensor for String I/O, only for +1-dimensional input(s)/output(s) are supported for I/O of String type. +Batching is not allowed for PyTorch models with String I/O. 
For these models, +the user must specify `max_batch_size: 0` in the configuration. diff --git a/src/libtorch.cc b/src/libtorch.cc index 6934a6c..e22b55a 100644 --- a/src/libtorch.cc +++ b/src/libtorch.cc @@ -461,7 +461,7 @@ class ModelInstanceState : public BackendModelInstance { std::vector* responses, const uint32_t response_count, std::vector* input_tensors, - std::vector* output_tensors); + std::vector* output_tensors); TRITONSERVER_Error* SetInputTensors( size_t total_batch_size, TRITONBACKEND_Request** requests, const uint32_t request_count, @@ -471,7 +471,7 @@ class ModelInstanceState : public BackendModelInstance { std::vector* input_memories, bool* cuda_copy); TRITONSERVER_Error* ReadOutputTensors( size_t total_batch_size, const std::vector& output_names, - const std::vector& output_tensors, + const std::vector& output_tensors, TRITONBACKEND_Request** requests, const uint32_t request_count, std::vector* responses, uint64_t* compute_end_ns); @@ -689,7 +689,8 @@ ModelInstanceState::ValidateInputs(const size_t expected_input_cnt) // Return error if all inputs are not of type Tensor for (size_t i = start_idx; i < arguments.size(); i++) { - if (arguments.at(i).type()->kind() != c10::TypeKind::TensorType) { + if ((arguments.at(i).type()->kind() != c10::TypeKind::TensorType) && + (arguments.at(i).type()->kind() != c10::TypeKind::ListType)) { return TRITONSERVER_ErrorNew( TRITONSERVER_ERROR_INTERNAL, (std::string("An input of type '") + arguments.at(i).type()->str() + @@ -725,6 +726,8 @@ ModelInstanceState::ValidateInputs(const size_t expected_input_cnt) "specified."); } + bool supports_batching = model_state_->MaxBatchSize() > 0; + for (size_t i = 0; i < ios.ArraySize(); i++) { triton::common::TritonJson::Value io; RETURN_IF_ERROR(ios.IndexAsObject(i, &io)); @@ -766,13 +769,37 @@ ModelInstanceState::ValidateInputs(const size_t expected_input_cnt) std::string io_dtype; RETURN_IF_ERROR(io.MemberAsString("data_type", &io_dtype)); const auto pr = ModelConfigDataTypeToTorchType(io_dtype); - if (!pr.first) { + if (!pr.first && (io_dtype != "TYPE_STRING")) { return TRITONSERVER_ErrorNew( TRITONSERVER_ERROR_INTERNAL, ("unsupported datatype " + io_dtype + " for input '" + io_name + "' for model '" + model_state_->Name() + "'") .c_str()); } + + // Validate shape for String inputs. Only allow 1 dimension and no + // batching. + if (io_dtype == "TYPE_STRING") { + // If a reshape is provided for the input then use that when + // validating the model shapes. 
+ std::vector dims; + triton::common::TritonJson::Value reshape; + if (io.Find("reshape", &reshape)) { + RETURN_IF_ERROR(ParseShape(reshape, "shape", &dims)); + } else { + RETURN_IF_ERROR(ParseShape(io, "dims", &dims)); + } + + if ((dims.size() > 1) || supports_batching) { + return TRITONSERVER_ErrorNew( + TRITONSERVER_ERROR_INTERNAL, + ("Triton only supports 1 dimensional List of String as input for " + "'" + + std::string(io_name) + "' for model '" + model_state_->Name() + + "'") + .c_str()); + } + } } return nullptr; // success @@ -793,6 +820,8 @@ ModelInstanceState::ValidateOutputs() "specified."); } + const bool supports_batching = model_state_->MaxBatchSize() > 0; + for (size_t i = 0; i < ios.ArraySize(); i++) { triton::common::TritonJson::Value io; RETURN_IF_ERROR(ios.IndexAsObject(i, &io)); @@ -819,13 +848,38 @@ ModelInstanceState::ValidateOutputs() std::string io_dtype; RETURN_IF_ERROR(io.MemberAsString("data_type", &io_dtype)); const auto pr = ModelConfigDataTypeToTorchType(io_dtype); - if (!pr.first) { + if (!pr.first && (io_dtype != "TYPE_STRING")) { return TRITONSERVER_ErrorNew( TRITONSERVER_ERROR_INTERNAL, ("unsupported datatype " + io_dtype + " for output '" + io_name + "' for model '" + model_state_->Name() + "'") .c_str()); } + + // Validate shape for String outputs. Only allow 1 dimension and no + // batching. + if (io_dtype == "TYPE_STRING") { + // If a reshape is provided for the output then use that when + // validating the model shapes. + std::vector dims; + triton::common::TritonJson::Value reshape; + if (io.Find("reshape", &reshape)) { + RETURN_IF_ERROR(ParseShape(reshape, "shape", &dims)); + } else { + RETURN_IF_ERROR(ParseShape(io, "dims", &dims)); + } + + if ((dims.size() > 1) || supports_batching) { + return TRITONSERVER_ErrorNew( + TRITONSERVER_ERROR_INTERNAL, + ("Triton only supports 1 dimensional List of String as output for " + "'" + + std::string(io_name) + "' for model '" + model_state_->Name() + + "'") + .c_str()); + } + } + output_index_map_[io_name] = op_index; output_dtype_map_[io_name] = ConvertTorchTypeToDataType(pr.second); } @@ -965,7 +1019,7 @@ ModelInstanceState::ProcessRequests( // 'output_tensors' are parallel vectors and so must be kept in // sync. std::vector output_names; - std::vector output_tensors; + std::vector output_tensors; if (!all_response_failed) { triton::common::TritonJson::Value ios; TRITONSERVER_Error* err = @@ -1036,8 +1090,7 @@ ModelInstanceState::ProcessRequests( std::string( "The output " + std::string(name) + " in the model configuration refers to an output index " - "which" - " doesn't exist. This model has " + + "which doesn't exist. 
This model has " + std::to_string(max_index + 1) + " outputs") .c_str())); invalid_index = true; @@ -1104,7 +1157,7 @@ ModelInstanceState::Execute( std::vector* responses, const uint32_t response_count, std::vector* input_tensors, - std::vector* output_tensors) + std::vector* output_tensors) { torch::jit::IValue model_outputs_; @@ -1169,20 +1222,38 @@ ModelInstanceState::Execute( if (model_outputs_.isTuple()) { auto model_outputs_tuple = model_outputs_.toTuple(); + size_t op_index = 0; for (auto& m_op : model_outputs_tuple->elements()) { - output_tensors->push_back(m_op.toTensor()); - } - } else { - try { - auto model_output_tensor = model_outputs_.toTensor(); - output_tensors->push_back(model_output_tensor); + if (m_op.isList()) { + auto list_output = m_op.toList(); + if (list_output.elementType()->kind() != c10::TypeKind::StringType) { + throw std::invalid_argument( + "output at index " + std::to_string(op_index) + + " must be of type Tensor or List[str], recieved List[" + + list_output.elementType()->str() + "]"); + } + output_tensors->push_back(m_op); + } else { + auto tensor_output = m_op.toTensor(); + output_tensors->push_back(m_op); + } + op_index++; } - catch (std::exception& exx) { + } else if (model_outputs_.isTensor()) { + output_tensors->push_back(model_outputs_); + } else if (model_outputs_.isList()) { + auto list_output = model_outputs_.toList(); + if (list_output.elementType()->kind() != c10::TypeKind::StringType) { throw std::invalid_argument( - "Output of torch model should be tensor or a tuple of tensors, not " - "a list / dictionary of tensors or a scalar: " + - std::string(exx.what())); + "output must be of type Tensor or List[str], recieved List[" + + list_output.elementType()->str() + "]"); } + output_tensors->push_back(model_outputs_); + } else { + throw std::invalid_argument( + "output must be of type Tensor, List[str] or Tuple " + "containing one of these two types. It should not be a List / " + "Dictionary of Tensors or a Scalar"); } } catch (std::exception& ex) { @@ -1194,6 +1265,244 @@ ModelInstanceState::Execute( } } +// This function will return a tensor's contents as a contiguous +// chunk in system memory. In some cases this will require copying the data. +// If that happens, 'contiguous_buffer' will be set to hold the contiguous +// chunk and 'cuda_copy' will be set to indicate whether CUDA copy is +// conducted. The data copy can be avoided if the input is already in +// a contiguous chunk and the input is located in memory type and id +// specified. 
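+// When a copy is made, 'content' ends up pointing into 'contiguous_buffer', so the
+// caller must keep 'contiguous_buffer' alive for as long as 'content' is read.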
+TRITONSERVER_Error* +GetContiguousInputContent( + TRITONBACKEND_Input* rinput, const uint32_t buffer_count, + const char** content, size_t* content_byte_size, + std::vector* contiguous_buffer, cudaStream_t stream, bool* cuda_copy) +{ + *cuda_copy = false; + + // Check input buffers to see if data copy is necessary + size_t chunk_count = 0; + bool type_mismatch = false; + uint64_t total_byte_size = 0; + for (size_t idx = 0; idx < buffer_count; ++idx) { + TRITONSERVER_MemoryType src_memory_type; + int64_t src_memory_type_id; + size_t src_byte_size; + const void* src_ptr; + + RETURN_IF_ERROR(TRITONBACKEND_InputBuffer( + rinput, idx, &src_ptr, &src_byte_size, &src_memory_type, + &src_memory_type_id)); + + if (src_ptr != nullptr) { + chunk_count++; + total_byte_size += src_byte_size; + type_mismatch |= (src_memory_type == TRITONSERVER_MEMORY_GPU); + } + } + + if (chunk_count == 0) { + *content = nullptr; + *content_byte_size = 0; + } else if ((chunk_count == 1) && !type_mismatch) { + TRITONSERVER_MemoryType src_memory_type; + int64_t src_memory_type_id; + RETURN_IF_ERROR(TRITONBACKEND_InputBuffer( + rinput, 0, (const void**)content, content_byte_size, &src_memory_type, + &src_memory_type_id)); + } else { + contiguous_buffer->resize(total_byte_size); + + size_t offset = 0; + for (size_t i = 0; i < chunk_count; i++) { + bool cuda_used; + TRITONSERVER_MemoryType src_memory_type; + int64_t src_memory_type_id; + size_t src_byte_size; + const void* src_ptr; + + RETURN_IF_ERROR(TRITONBACKEND_InputBuffer( + rinput, i, &src_ptr, &src_byte_size, &src_memory_type, + &src_memory_type_id)); + RETURN_IF_ERROR(CopyBuffer( + "Contiguous input", src_memory_type, src_memory_type_id, + TRITONSERVER_MEMORY_CPU, 0, src_byte_size, src_ptr, + contiguous_buffer->data() + offset, stream, &cuda_used)); + *cuda_copy |= cuda_used; + offset += src_byte_size; + } + + *content = contiguous_buffer->data(); + *content_byte_size = total_byte_size; + } + + return nullptr; // success +} + +void +FillStringTensor( + torch::List* input_list, const size_t idx, const size_t cnt) +{ + for (size_t c = 0; c < cnt; ++c) { + input_list->push_back(""); + } +} + +bool +SetStringInputTensor( + torch::List* input_list, TRITONBACKEND_Input* input, + const char* name, const uint32_t buffer_count, + const size_t request_element_cnt, const size_t tensor_offset, + TRITONBACKEND_Response** response, cudaStream_t stream, + const char* host_policy_name) +{ + bool cuda_copy = false; + size_t element_idx = 0; + + // For string data type, we always need to have the data on CPU so + // that we can read string length and construct the string + // properly. So if the request's input tensor is not in CPU need to + // copy it there. + const char* content = nullptr; + size_t content_byte_size = 0; + + std::vector contiguous_buffer; + auto err = GetContiguousInputContent( + input, buffer_count, &content, &content_byte_size, &contiguous_buffer, + stream, &cuda_copy); + if (err != nullptr) { + RESPOND_AND_SET_NULL_IF_ERROR(response, err); + FillStringTensor( + input_list, tensor_offset + element_idx, + request_element_cnt - element_idx); + return cuda_copy; + } + +#ifdef TRITON_ENABLE_GPU + if (cuda_copy) { + cudaStreamSynchronize(stream); + cuda_copy = false; + } +#endif // TRITON_ENABLE_GPU + + // Parse content and assign to 'tensor'. Each string in 'content' + // is a 4-byte length followed by the string itself with no + // null-terminator. 
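+  // For example, on a little-endian machine the two strings "ab" and "c" arrive
+  // as the 11 bytes: 02 00 00 00 61 62 01 00 00 00 63 (0x61='a', 0x62='b', 0x63='c').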
+ while (content_byte_size >= sizeof(uint32_t)) { + if (element_idx >= request_element_cnt) { + RESPOND_AND_SET_NULL_IF_ERROR( + response, + TRITONSERVER_ErrorNew( + TRITONSERVER_ERROR_INVALID_ARG, + std::string( + "unexpected number of string elements " + + std::to_string(element_idx + 1) + " for inference input '" + + name + "', expecting " + std::to_string(request_element_cnt)) + .c_str())); + FillStringTensor( + input_list, tensor_offset + element_idx, + request_element_cnt - element_idx); + return cuda_copy; + } + + const uint32_t len = *(reinterpret_cast(content)); + content += sizeof(uint32_t); + content_byte_size -= sizeof(uint32_t); + + if (content_byte_size < len) { + RESPOND_AND_SET_NULL_IF_ERROR( + response, + TRITONSERVER_ErrorNew( + TRITONSERVER_ERROR_INVALID_ARG, + std::string( + "incomplete string data for inference input '" + + std::string(name) + "', expecting string of length " + + std::to_string(len) + " but only " + + std::to_string(content_byte_size) + " bytes available") + .c_str())); + FillStringTensor( + input_list, tensor_offset + element_idx, + request_element_cnt - element_idx); + return cuda_copy; + } + + // Set string value + input_list->push_back(std::string(content, len)); + + content += len; + content_byte_size -= len; + element_idx++; + } + + if ((*response != nullptr) && (element_idx != request_element_cnt)) { + RESPOND_AND_SET_NULL_IF_ERROR( + response, TRITONSERVER_ErrorNew( + TRITONSERVER_ERROR_INTERNAL, + std::string( + "expected " + std::to_string(request_element_cnt) + + " strings for inference input '" + name + "', got " + + std::to_string(element_idx)) + .c_str())); + FillStringTensor( + input_list, tensor_offset + element_idx, + request_element_cnt - element_idx); + } + + return cuda_copy; +} + +bool +SetStringOutputBuffer( + torch::List* tensor, TRITONBACKEND_Response** response, + TRITONBACKEND_Output* response_output, const size_t tensor_element_count, + const size_t tensor_offset, cudaStream_t stream, std::string* serialized) +{ + bool cuda_copy = false; + + // Serialize the output tensor strings. Each string is serialized as + // a 4-byte length followed by the string itself with no + // null-terminator. + serialized->clear(); + for (size_t e = 0; e < tensor_element_count; ++e) { + std::string str = tensor->get(e).to(); + const char* cstr = str.c_str(); + size_t len = str.length(); + serialized->append(reinterpret_cast(&len), sizeof(uint32_t)); + if (len > 0) { + serialized->append(cstr, len); + } + } + + // Allocate a buffer large enough to hold the serialized tensor. + TRITONSERVER_MemoryType actual_memory_type = TRITONSERVER_MEMORY_CPU; + int64_t actual_memory_type_id = 0; + + void* buffer; + auto err = TRITONBACKEND_OutputBuffer( + response_output, &buffer, serialized->size(), &actual_memory_type, + &actual_memory_type_id); + if (err != nullptr) { + RESPOND_AND_SET_NULL_IF_ERROR(response, err); + return cuda_copy; + } + + // Copy the serialized tensor into the allocated buffer. 
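+  // CopyBuffer sets 'cuda_used' when the copy was issued asynchronously on 'stream',
+  // in which case the caller must synchronize the stream before reading 'buffer'.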
+ bool cuda_used = false; + err = CopyBuffer( + "String output", TRITONSERVER_MEMORY_CPU /* src_memory_type */, + 0 /* src_memory_type_id */, actual_memory_type, actual_memory_type_id, + serialized->size(), reinterpret_cast(serialized->c_str()), + buffer, stream, &cuda_used); + cuda_copy |= cuda_used; + + if (err != nullptr) { + RESPOND_AND_SET_NULL_IF_ERROR(response, err); + return cuda_copy; + } + + return cuda_copy; +} + TRITONSERVER_Error* ModelInstanceState::SetInputTensors( size_t total_batch_size, TRITONBACKEND_Request** requests, @@ -1252,17 +1561,59 @@ ModelInstanceState::SetInputTensors( input_name, nullptr, 0, alloc_perference, &input_buffer, &batchn_byte_size, &memory_type, &memory_type_id)); - // Create Torch tenor + // Create Torch tensor const auto torch_dtype = ConvertDataTypeToTorchType(input_datatype); torch::TensorOptions options{torch_dtype.second}; auto updated_options = (memory_type == TRITONSERVER_MEMORY_GPU) ? options.device(torch::kCUDA, device_.index()) : options.device(torch::kCPU); - // Remove constness to align with the signature of torch::from_blob() - torch::Tensor input_tensor = torch::from_blob( - const_cast(input_buffer), batchn_shape, updated_options); - (*input_tensors)[input_index_map_[input_name]] = input_tensor; + + if (input_datatype == TRITONSERVER_TYPE_BYTES) { + if (batchn_shape.size() != 1) { + return TRITONSERVER_ErrorNew( + TRITONSERVER_ERROR_INTERNAL, ("Triton only supports 1 dimensional " + "List of string as input for '" + + std::string(input_name) + "'") + .c_str()); + } + + // Create the PyTorch list to hold the strings. + torch::List input_list; + input_list.reserve(batchn_shape[0]); + + size_t tensor_offset = 0; + + for (size_t idx = 0; idx < request_count; idx++) { + TRITONBACKEND_Input* input; + RESPOND_AND_SET_NULL_IF_ERROR( + &((*responses)[idx]), + TRITONBACKEND_RequestInput(requests[idx], input_name, &input)); + const int64_t* shape; + uint32_t dims_count; + uint32_t buffer_count; + RESPOND_AND_SET_NULL_IF_ERROR( + &((*responses)[idx]), + TRITONBACKEND_InputPropertiesForHostPolicy( + input, HostPolicyName().c_str(), nullptr, nullptr, &shape, + &dims_count, nullptr, &buffer_count)); + + const int64_t batch_element_cnt = GetElementCount(shape, dims_count); + + *cuda_copy |= SetStringInputTensor( + &input_list, input, input_name, buffer_count, batch_element_cnt, + tensor_offset, &((*responses)[idx]), CudaStream(), + HostPolicyName().c_str()); + tensor_offset += batch_element_cnt; + } + + (*input_tensors)[input_index_map_[input_name]] = input_list; + } else { + // Remove constness to align with the signature of torch::from_blob() + torch::Tensor input_tensor = torch::from_blob( + const_cast(input_buffer), batchn_shape, updated_options); + (*input_tensors)[input_index_map_[input_name]] = input_tensor; + } } // Finalize... 
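+  // Note: a string input is handed to the scripted model as a single List[str]
+  // IValue, so the corresponding forward() argument must be typed List[str].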
@@ -1274,7 +1625,7 @@ ModelInstanceState::SetInputTensors( TRITONSERVER_Error* ModelInstanceState::ReadOutputTensors( size_t total_batch_size, const std::vector& output_names, - const std::vector& output_tensors, + const std::vector& output_tensors, TRITONBACKEND_Request** requests, const uint32_t request_count, std::vector* responses, uint64_t* compute_end_ns) { @@ -1284,62 +1635,106 @@ ModelInstanceState::ReadOutputTensors( CudaStream()); bool cuda_copy = false; - std::vector> string_buffers; + // The serialized string buffer must be valid until output copies are done + std::vector> string_buffer; for (size_t idx = 0; idx < output_names.size(); idx++) { std::string name = output_names[idx]; int op_index = output_index_map_[name]; - torch::Tensor output_flat; - try { - output_flat = output_tensors[op_index].contiguous().flatten(); - } - catch (std::exception& ex) { - RETURN_IF_ERROR(TRITONSERVER_ErrorNew( - TRITONSERVER_ERROR_INTERNAL, - (std::string("output tensor '") + name + "' is not found").c_str())); - } + if (output_tensors[op_index].isTensor()) { + torch::Tensor output_flat; + try { + output_flat = + output_tensors[op_index].toTensor().contiguous().flatten(); + } + catch (std::exception& ex) { + RETURN_IF_ERROR(TRITONSERVER_ErrorNew( + TRITONSERVER_ERROR_INTERNAL, + (std::string("output tensor '") + name + "' is not found") + .c_str())); + } - // Verify output datatype matches datatype from model config - TRITONSERVER_DataType output_dtype = - ConvertTorchTypeToDataType(output_flat.scalar_type()); - TRITONSERVER_DataType config_datatype = output_dtype_map_[name]; - if (config_datatype != output_dtype) { - RETURN_IF_ERROR(TRITONSERVER_ErrorNew( - TRITONSERVER_ERROR_INVALID_ARG, - (std::string("configuration expects datatype TYPE_") + - TRITONSERVER_DataTypeString(config_datatype) + " for output '" + - name + "', model provides TYPE_" + - TRITONSERVER_DataTypeString(output_dtype)) - .c_str())); - } + // Verify output datatype matches datatype from model config + TRITONSERVER_DataType output_dtype = + ConvertTorchTypeToDataType(output_flat.scalar_type()); + TRITONSERVER_DataType config_datatype = output_dtype_map_[name]; + if (config_datatype != output_dtype) { + RETURN_IF_ERROR(TRITONSERVER_ErrorNew( + TRITONSERVER_ERROR_INVALID_ARG, + (std::string("configuration expects datatype TYPE_") + + TRITONSERVER_DataTypeString(config_datatype) + " for output '" + + name + "', model provides TYPE_" + + TRITONSERVER_DataTypeString(output_dtype)) + .c_str())); + } - const char* output_buffer = - static_cast(output_flat.data_ptr()); + const char* output_buffer = + static_cast(output_flat.data_ptr()); - // Output tensors may not reside on the same device as model - torch::Device tensor_device = output_flat.device(); + // Output tensors may not reside on the same device as model + torch::Device tensor_device = output_flat.device(); - // Set output shape - std::vector batchn_shape; - auto shape = output_tensors[op_index].sizes(); - for (auto itr = shape.begin(); itr != shape.end(); itr++) { - batchn_shape.push_back(*itr); - } + // Get output shape + std::vector batchn_shape; + auto shape = output_tensors[op_index].toTensor().sizes(); + for (auto itr = shape.begin(); itr != shape.end(); itr++) { + batchn_shape.push_back(*itr); + } - if (batchn_shape.size() == 0) { - RETURN_IF_ERROR(TRITONSERVER_ErrorNew( + if (batchn_shape.size() == 0) { + return TRITONSERVER_ErrorNew( + TRITONSERVER_ERROR_INVALID_ARG, + (std::string("output '") + name + + "' is a scalar which is not supported.") + .c_str()); + } + 
+ responder.ProcessTensor( + name, output_dtype, batchn_shape, output_buffer, + (tensor_device.type() == torch::kCPU) ? TRITONSERVER_MEMORY_CPU + : TRITONSERVER_MEMORY_GPU, + (tensor_device.type() == torch::kCPU) ? 0 : tensor_device.index()); + + } else if (output_tensors[op_index].isList()) { + // Custom handling for string/bytes tensor... + + torch::List output_list = + output_tensors[op_index].toList(); + + // Get output shape + std::vector batchn_shape{(int64_t)output_list.size()}; + + size_t tensor_offset = 0; + + for (size_t idx = 0; idx < responses->size(); idx++) { + auto& response = (*responses)[idx]; + + const size_t tensor_element_cnt = GetElementCount(batchn_shape); + + // Only need an response tensor for requested outputs. + if (response != nullptr) { + TRITONBACKEND_Output* response_output; + RESPOND_AND_SET_NULL_IF_ERROR( + &response, TRITONBACKEND_ResponseOutput( + response, &response_output, name.c_str(), + TRITONSERVER_TYPE_BYTES, batchn_shape.data(), + batchn_shape.size())); + string_buffer.emplace_back(new std::string()); + cuda_copy |= SetStringOutputBuffer( + &output_list, &response, response_output, tensor_element_cnt, + tensor_offset, CudaStream(), string_buffer.back().get()); + } + + tensor_offset += tensor_element_cnt; + } + } else { + return TRITONSERVER_ErrorNew( TRITONSERVER_ERROR_INVALID_ARG, (std::string("output '") + name + - "' is a scalar which is not supported.") - .c_str())); + "' must be of type Tensor or List[str].") + .c_str()); } - responder.ProcessTensor( - name, output_dtype, batchn_shape, output_buffer, - (tensor_device.type() == torch::kCPU) ? TRITONSERVER_MEMORY_CPU - : TRITONSERVER_MEMORY_GPU, - (tensor_device.type() == torch::kCPU) ? 0 : tensor_device.index()); - // PyTorch uses asynchronous execution to run the model. Setting the compute // end timestamp immediately after Execute() does not capture the complete // model execution time. When the first output buffer is accessed/copied by From ff103c40e7263076a33e84c16e30e6501143422c Mon Sep 17 00:00:00 2001 From: Hemant Jain Date: Thu, 19 May 2022 17:55:14 -0700 Subject: [PATCH 04/76] Enforce ordering of I/O if naming convention is not followed (#63) * Enforce ordering of I/O if naming convention is not followed * Enforce usage of consistent naming convention for inputs and outputs - Convention between inputs and outputs can differ * Use helper function GetNamingConvention - use switch case - use c++ style enum * Use class enum - cleanup unnecessary code blocks * Add clarifying comment about atoi usage * fix typo * Remove try catch for atoi and use checks for is digit instead --- src/libtorch.cc | 233 +++++++++++++++++++++++++++++++++++++----------- 1 file changed, 181 insertions(+), 52 deletions(-) diff --git a/src/libtorch.cc b/src/libtorch.cc index e22b55a..e39a2ec 100644 --- a/src/libtorch.cc +++ b/src/libtorch.cc @@ -424,6 +424,14 @@ ModelState::ParseParameters() return nullptr; } +// The naming convention followed for inputs/outputs in the model configuration. +// Outputs don't support FORWARD_ARGUMENT. 
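+// NAMED_INDEX: the I/O name carries its index as a suffix (the <name>__<index> form).
+// FORWARD_ARGUMENT: input names match the argument names of the model's forward() method.
+// STRICT_CONFIG_ORDERING: fall back to the order in which I/O are listed in the configuration.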
+enum class NamingConvention { + NAMED_INDEX, + FORWARD_ARGUMENT, + STRICT_CONFIG_ORDERING +}; + // // ModelInstanceState // @@ -476,6 +484,11 @@ class ModelInstanceState : public BackendModelInstance { std::vector* responses, uint64_t* compute_end_ns); + // Get the naming convention for inputs/outputs from the model configuration + TRITONSERVER_Error* GetNamingConvention( + NamingConvention* naming_convention, + const std::set& allowed_io); + ModelState* model_state_; // The full path to the TorchScript model file. @@ -597,21 +610,29 @@ ModelInstanceState::ValidateBooleanSequenceControl( if (*have_control) { std::string deliminator = "__"; int ip_index = 0; - try { - int start_pos = tensor_name.find(deliminator); - if (start_pos == -1) { - throw std::invalid_argument("input must follow naming convention"); - } - ip_index = std::atoi(tensor_name.substr(start_pos + 2).c_str()); - input_index_map_[tensor_name] = ip_index; - } - catch (std::exception& ex) { + int start_pos = tensor_name.find(deliminator); + if (start_pos == -1) { return TRITONSERVER_ErrorNew( TRITONSERVER_ERROR_INTERNAL, ("input '" + tensor_name + - "' does not follow naming convention i.e. __.") + "' does not follow __ naming convention.") .c_str()); } + + // check if the index part of the name is not an integer + std::string index_str = tensor_name.substr(start_pos + 2); + for (auto itr = index_str.begin(); itr != index_str.end(); itr++) { + if (std::isdigit(*itr) == 0) { + return TRITONSERVER_ErrorNew( + TRITONSERVER_ERROR_INTERNAL, + ("input '" + tensor_name + + "' does not follow __ naming convention.") + .c_str()); + } + } + + ip_index = std::atoi(tensor_name.substr(start_pos + 2).c_str()); + input_index_map_[tensor_name] = ip_index; } return nullptr; // success @@ -631,21 +652,29 @@ ModelInstanceState::ValidateTypedSequenceControl( if (*have_control) { std::string deliminator = "__"; int ip_index = 0; - try { - int start_pos = tensor_name.find(deliminator); - if (start_pos == -1) { - throw std::invalid_argument("input must follow naming convention"); - } - ip_index = std::atoi(tensor_name.substr(start_pos + 2).c_str()); - input_index_map_[tensor_name] = ip_index; - } - catch (std::exception& ex) { + int start_pos = tensor_name.find(deliminator); + if (start_pos == -1) { return TRITONSERVER_ErrorNew( TRITONSERVER_ERROR_INTERNAL, ("input '" + tensor_name + - "' does not follow naming convention i.e. 
__.") + "' does not follow __ naming convention.") .c_str()); } + + // check if the index part of the name is not an integer + std::string index_str = tensor_name.substr(start_pos + 2); + for (auto itr = index_str.begin(); itr != index_str.end(); itr++) { + if (std::isdigit(*itr) == 0) { + return TRITONSERVER_ErrorNew( + TRITONSERVER_ERROR_INTERNAL, + ("input '" + tensor_name + + "' does not follow __ naming convention.") + .c_str()); + } + } + + ip_index = std::atoi(tensor_name.substr(start_pos + 2).c_str()); + input_index_map_[tensor_name] = ip_index; } return nullptr; // success @@ -727,6 +756,8 @@ ModelInstanceState::ValidateInputs(const size_t expected_input_cnt) } bool supports_batching = model_state_->MaxBatchSize() > 0; + NamingConvention naming_convention; + RETURN_IF_ERROR(GetNamingConvention(&naming_convention, allowed_inputs)); for (size_t i = 0; i < ios.ArraySize(); i++) { triton::common::TritonJson::Value io; @@ -740,27 +771,24 @@ ModelInstanceState::ValidateInputs(const size_t expected_input_cnt) // input names since they are the keys for the dictionary input_index_map_[io_name] = i; } else { - // input tensor name must be in 'allowed_inputs' or must follow the naming - // convention - auto itr = allowed_inputs.find(io_name); - if (itr != allowed_inputs.end()) { - input_index_map_[io_name] = std::distance(allowed_inputs.begin(), itr); - } else { - try { - int start_pos = io_name.find(deliminator); - if (start_pos == -1) { - throw std::invalid_argument("input must follow naming convention"); + switch (naming_convention) { + case NamingConvention::FORWARD_ARGUMENT: { + auto itr = allowed_inputs.find(io_name); + if (itr != allowed_inputs.end()) { + input_index_map_[io_name] = + std::distance(allowed_inputs.begin(), itr); } + break; + } + case NamingConvention::NAMED_INDEX: { + int start_pos = io_name.find(deliminator); ip_index = std::atoi(io_name.substr(start_pos + 2).c_str()); input_index_map_[io_name] = ip_index; + break; } - catch (std::exception& ex) { - return TRITONSERVER_ErrorNew( - TRITONSERVER_ERROR_INTERNAL, - ("input '" + io_name + - "' is neither an input argument to the model nor does it " - "follow the naming convention i.e. __.") - .c_str()); + case NamingConvention::STRICT_CONFIG_ORDERING: { + input_index_map_[io_name] = i; + break; } } } @@ -821,6 +849,8 @@ ModelInstanceState::ValidateOutputs() } const bool supports_batching = model_state_->MaxBatchSize() > 0; + NamingConvention naming_convention; + RETURN_IF_ERROR(GetNamingConvention(&naming_convention, {})); for (size_t i = 0; i < ios.ArraySize(); i++) { triton::common::TritonJson::Value io; @@ -829,19 +859,18 @@ ModelInstanceState::ValidateOutputs() // Validate name std::string io_name; RETURN_IF_ERROR(io.MemberAsString("name", &io_name)); - try { - int start_pos = io_name.find(deliminator); - if (start_pos == -1) { - throw std::invalid_argument("output must follow naming convention"); + switch (naming_convention) { + case NamingConvention::NAMED_INDEX: { + int start_pos = io_name.find(deliminator); + op_index = std::atoi(io_name.substr(start_pos + 2).c_str()); + break; } - op_index = std::atoi(io_name.substr(start_pos + 2).c_str()); - } - catch (std::exception& ex) { - return TRITONSERVER_ErrorNew( - TRITONSERVER_ERROR_INTERNAL, - ("output '" + io_name + - "' does not follow naming convention i.e. 
__.") - .c_str()); + case NamingConvention::STRICT_CONFIG_ORDERING: { + op_index = i; + break; + } + default: + break; } // Validate data type @@ -1251,9 +1280,9 @@ ModelInstanceState::Execute( output_tensors->push_back(model_outputs_); } else { throw std::invalid_argument( - "output must be of type Tensor, List[str] or Tuple " - "containing one of these two types. It should not be a List / " - "Dictionary of Tensors or a Scalar"); + "output must be of type Tensor, List[str] or Tuple containing one of " + "these two types. It should not be a List / Dictionary of Tensors or " + "a Scalar"); } } catch (std::exception& ex) { @@ -1265,6 +1294,106 @@ ModelInstanceState::Execute( } } +TRITONSERVER_Error* +ModelInstanceState::GetNamingConvention( + NamingConvention* naming_convention, + const std::set& allowed_ios) +{ + // Rules for (non-Dictionary) input tensor names: + // 1. Must be in 'allowed_inputs' (arguments in the forward function) + // 2. Must follow the naming convention i.e. __ + // 3. If neither of the above conditions are satisfied, enforce strict + // ordering of model inputs. + // + // Rules for output tensor names: + // 1. Must follow the naming convention i.e. __ + // 2. If not, we enforce strict ordering of model outputs. + std::string deliminator = "__"; + std::string io_kind = "input"; + *naming_convention = NamingConvention::FORWARD_ARGUMENT; + + // symbolizes output + if (allowed_ios.size() == 0) { + io_kind = "output"; + *naming_convention = NamingConvention::NAMED_INDEX; + } + + triton::common::TritonJson::Value ios; + RETURN_IF_ERROR( + model_state_->ModelConfig().MemberAsArray(io_kind.c_str(), &ios)); + + if (io_kind == "input") { + for (size_t i = 0; i < ios.ArraySize(); i++) { + triton::common::TritonJson::Value io; + RETURN_IF_ERROR(ios.IndexAsObject(i, &io)); + + // Validate name + std::string io_name; + RETURN_IF_ERROR(io.MemberAsString("name", &io_name)); + auto itr = allowed_ios.find(io_name); + if (itr == allowed_ios.end()) { + *naming_convention = NamingConvention::NAMED_INDEX; + break; + } + } + } + + // If not, check if inputs follow INDEX + if (*naming_convention == NamingConvention::NAMED_INDEX) { + for (size_t i = 0; i < ios.ArraySize(); i++) { + triton::common::TritonJson::Value io; + RETURN_IF_ERROR(ios.IndexAsObject(i, &io)); + + // Validate name + std::string io_name; + RETURN_IF_ERROR(io.MemberAsString("name", &io_name)); + int start_pos = io_name.find(deliminator); + if (start_pos == -1) { + *naming_convention = NamingConvention::STRICT_CONFIG_ORDERING; + break; + } else { + // check if the index part of the name is not an integer + std::string index_str = io_name.substr(start_pos + 2); + bool is_int = true; + for (auto itr = index_str.begin(); itr != index_str.end(); itr++) { + if (std::isdigit(*itr) == 0) { + is_int = false; + } + } + + if (!is_int) { + if (io_kind == "input") { + LOG_MESSAGE( + TRITONSERVER_LOG_WARN, + ("input '" + io_name + + "' or previous input(s) are neither an input argument to the " + "model '" + + model_state_->Name() + + "' nor do they follow the __ naming convention. " + "Falling back to enforcing strict ordering from model " + "configuration.") + .c_str()); + } else { + LOG_MESSAGE( + TRITONSERVER_LOG_WARN, + ("output '" + io_name + + "' or previous output(s) of the model '" + + model_state_->Name() + + "' do not follow the __ naming convention. 
" + "Falling back to enforcing strict ordering from model " + "configuration.") + .c_str()); + } + *naming_convention = NamingConvention::STRICT_CONFIG_ORDERING; + break; + } + } + } + } + + return nullptr; // success +} + // This function will return a tensor's contents as a contiguous // chunk in system memory. In some cases this will require copying the data. // If that happens, 'contiguous_buffer' will be set to hold the contiguous From df900ce3d3c03e7f34ce1e9518c09cc2bad520a5 Mon Sep 17 00:00:00 2001 From: Tanmay Verma Date: Tue, 24 May 2022 13:42:51 -0700 Subject: [PATCH 05/76] Add nvtx markers which can be useful in perf profiling (#64) --- CMakeLists.txt | 5 +++++ src/libtorch.cc | 8 +++++++- 2 files changed, 12 insertions(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 909148f..a5ea654 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -44,6 +44,7 @@ project(tritonpytorchbackend LANGUAGES C CXX) option(TRITON_ENABLE_GPU "Enable GPU support in backend" ON) option(TRITON_ENABLE_STATS "Include statistics collections in backend" ON) +option(TRITON_ENABLE_NVTX "Include nvtx markers collection in backend." OFF) option(TRITON_PYTORCH_ENABLE_TORCHTRT "Enable TorchTRT support" OFF) option(TRITON_PYTORCH_ENABLE_TORCHVISION "Enable Torchvision support" ON) @@ -120,6 +121,10 @@ else() endif() endif() # TRITON_ENABLE_GPU +if(${TRITON_ENABLE_NVTX}) + add_definitions(-DTRITON_ENABLE_NVTX=1) +endif() # TRITON_ENABLE_NVTX + # # Shared library implementing the Triton Backend API # diff --git a/src/libtorch.cc b/src/libtorch.cc index e39a2ec..a142f72 100644 --- a/src/libtorch.cc +++ b/src/libtorch.cc @@ -33,6 +33,7 @@ #include "triton/backend/backend_model.h" #include "triton/backend/backend_model_instance.h" #include "triton/backend/backend_output_responder.h" +#include "triton/common/nvtx.h" #include "triton/core/tritonbackend.h" #ifdef TRITON_PYTORCH_ENABLE_TORCHVISION @@ -307,7 +308,6 @@ ModelState::ParseParameters() TRITONSERVER_ErrorDelete(err); } } - LOG_MESSAGE( TRITONSERVER_LOG_INFO, (std::string("Inference Mode is ") + @@ -926,6 +926,8 @@ ModelInstanceState::ProcessRequests( std::to_string(request_count) + " requests") .c_str()); + NVTX_RANGE(nvtx_, "ProcessRequests " + Name()); + uint64_t exec_start_ns = 0; SET_TIMESTAMP(exec_start_ns); @@ -1188,6 +1190,8 @@ ModelInstanceState::Execute( std::vector* input_tensors, std::vector* output_tensors) { + NVTX_RANGE(nvtx_, "Execute " + Name()); + torch::jit::IValue model_outputs_; try { @@ -1758,6 +1762,8 @@ ModelInstanceState::ReadOutputTensors( TRITONBACKEND_Request** requests, const uint32_t request_count, std::vector* responses, uint64_t* compute_end_ns) { + NVTX_RANGE(nvtx_, "ReadOutputTensors " + Name()); + BackendOutputResponder responder( requests, request_count, responses, model_state_->TritonMemoryManager(), model_state_->MaxBatchSize() > 0, model_state_->EnablePinnedInput(), From b8baa9310e06cb486dfb962d1f7ac3a8314a5ea2 Mon Sep 17 00:00:00 2001 From: Hemant Jain Date: Wed, 22 Jun 2022 16:06:32 -0700 Subject: [PATCH 06/76] Add jpeg CV deps to PyTorch (#65) - Needed by Torchvision --- CMakeLists.txt | 3 +++ 1 file changed, 3 insertions(+) diff --git a/CMakeLists.txt b/CMakeLists.txt index a5ea654..41aa7de 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -182,6 +182,7 @@ set(OPENCV_LIBS "libopencv_imgproc.so" "libopencv_core.so" "libpng16.so" + "libjpeg.so" ) if (${TRITON_PYTORCH_DOCKER_BUILD}) @@ -220,6 +221,7 @@ if (${TRITON_PYTORCH_DOCKER_BUILD}) COMMAND docker cp 
pytorch_backend_ptlib:/usr/lib/${LIBS_ARCH}-linux-gnu/libopencv_imgproc.so.3.4.11 libopencv_imgproc.so COMMAND docker cp pytorch_backend_ptlib:/usr/lib/${LIBS_ARCH}-linux-gnu/libopencv_core.so.3.4.11 libopencv_core.so COMMAND docker cp pytorch_backend_ptlib:/usr/lib/${LIBS_ARCH}-linux-gnu/libpng16.so.16.37.0 libpng16.so + COMMAND docker cp pytorch_backend_ptlib:/usr/lib/${LIBS_ARCH}-linux-gnu/libjpeg.so.8.2.2 libjpeg.so COMMAND /bin/sh -c "if [ -f libmkl_def.so ]; then patchelf --add-needed libmkl_gnu_thread.so libmkl_def.so; fi" COMMAND /bin/sh -c "if [ -f libmkl_def.so ]; then patchelf --add-needed libmkl_core.so libmkl_def.so; fi" COMMAND /bin/sh -c "if [ -f libmkl_avx2.so ]; then patchelf --add-needed libmkl_gnu_thread.so libmkl_avx2.so; fi" @@ -439,6 +441,7 @@ if (${TRITON_PYTORCH_DOCKER_BUILD}) COMMAND ln -sf libopencv_imgproc.so libopencv_imgproc.so.${OPENCV_VERSION} COMMAND ln -sf libopencv_core.so libopencv_core.so.${OPENCV_VERSION} COMMAND ln -sf libpng16.so libpng16.so.16 + COMMAND ln -sf libjpeg.so libjpeg.so.8 RESULT_VARIABLE LINK_STATUS WORKING_DIRECTORY ${CMAKE_INSTALL_PREFIX}/backends/pytorch) if(LINK_STATUS AND NOT LINK_STATUS EQUAL 0) From f95175762771cb339a09ba1ff61b214f68973274 Mon Sep 17 00:00:00 2001 From: Hemant Jain Date: Thu, 23 Jun 2022 15:37:28 -0700 Subject: [PATCH 07/76] Fix intel mkl issue that causes segfault (#66) * Fix intel mkl issue that causes segfault - during destruction - causes freeze during inference * Add comment --- CMakeLists.txt | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/CMakeLists.txt b/CMakeLists.txt index 41aa7de..e9fe9ce 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -185,6 +185,9 @@ set(OPENCV_LIBS "libjpeg.so" ) +# The patchelf commands ensure the MKL libraries are loaded correctly during runtime +# Without these, the framework/backend complains of missing libraries / symbols and +# in some cases leads to segmentation faults. 
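+# ('patchelf --add-needed <dep> <lib>' records <dep> as a DT_NEEDED entry of <lib>, so the
+# dynamic linker loads it automatically whenever <lib> is loaded.)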
if (${TRITON_PYTORCH_DOCKER_BUILD}) string(REPLACE ";" " " CONDA_LIBS_STR "${CONDA_LIBS}") @@ -228,6 +231,10 @@ if (${TRITON_PYTORCH_DOCKER_BUILD}) COMMAND /bin/sh -c "if [ -f libmkl_avx2.so ]; then patchelf --add-needed libmkl_core.so libmkl_avx2.so; fi" COMMAND /bin/sh -c "if [ -f libmkl_avx512.so ]; then patchelf --add-needed libmkl_gnu_thread.so libmkl_avx512.so; fi" COMMAND /bin/sh -c "if [ -f libmkl_avx512.so ]; then patchelf --add-needed libmkl_core.so libmkl_avx512.so; fi" + COMMAND /bin/sh -c "if [ -f libmkl_vml_def.so ]; then patchelf --add-needed libmkl_gnu_thread.so libmkl_vml_def.so; fi" + COMMAND /bin/sh -c "if [ -f libmkl_vml_def.so ]; then patchelf --add-needed libmkl_intel_thread.so libmkl_vml_def.so; fi" + COMMAND /bin/sh -c "if [ -f libmkl_vml_def.so ]; then patchelf --add-needed libmkl_core.so libmkl_vml_def.so; fi" + COMMAND /bin/sh -c "patchelf --add-needed libmkl_intel_lp64.so libmkl_intel_thread.so" COMMAND docker rm pytorch_backend_ptlib COMMENT "Extracting pytorch and torchvision libraries and includes from ${TRITON_PYTORCH_DOCKER_IMAGE}" VERBATIM From 858be505819611ac5bd6367491c3c260e80ef416 Mon Sep 17 00:00:00 2001 From: Misha Chornyi <99709299+mc-nv@users.noreply.github.com> Date: Fri, 24 Jun 2022 08:57:10 -0700 Subject: [PATCH 08/76] Adding verification for link creation (#67) --- CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index e9fe9ce..b5c4754 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -234,7 +234,7 @@ if (${TRITON_PYTORCH_DOCKER_BUILD}) COMMAND /bin/sh -c "if [ -f libmkl_vml_def.so ]; then patchelf --add-needed libmkl_gnu_thread.so libmkl_vml_def.so; fi" COMMAND /bin/sh -c "if [ -f libmkl_vml_def.so ]; then patchelf --add-needed libmkl_intel_thread.so libmkl_vml_def.so; fi" COMMAND /bin/sh -c "if [ -f libmkl_vml_def.so ]; then patchelf --add-needed libmkl_core.so libmkl_vml_def.so; fi" - COMMAND /bin/sh -c "patchelf --add-needed libmkl_intel_lp64.so libmkl_intel_thread.so" + COMMAND /bin/sh -c "if [ -f libmkl_intel_thread.so ]; then patchelf --add-needed libmkl_intel_lp64.so libmkl_intel_thread.so; fi" COMMAND docker rm pytorch_backend_ptlib COMMENT "Extracting pytorch and torchvision libraries and includes from ${TRITON_PYTORCH_DOCKER_IMAGE}" VERBATIM From 663ee99e1b374c0722e71175d28e05e968f49630 Mon Sep 17 00:00:00 2001 From: "Jeffery (Zeyu) Zhao" Date: Wed, 6 Jul 2022 08:00:59 +0800 Subject: [PATCH 09/76] Add CUDA cache cleaning flag to pytorch backend (#61) * Add CUDA cache cleaning flag to pytorch backend * minor fixes minor code formatting change per review * Add more notes to README --- README.md | 17 +++++++++++++++++ src/libtorch.cc | 42 +++++++++++++++++++++++++++++++++++++++--- 2 files changed, 56 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 4b41c1a..d5050f6 100644 --- a/README.md +++ b/README.md @@ -179,6 +179,23 @@ key: "ENABLE_WEIGHT_SHARING" } ``` +* `ENABLE_CACHE_CLEANING`: Boolean flag to enable CUDA cache cleaning after each model execution. +If not specified, cache cleaning is disabled. This flag has no effect if model is on CPU. +Setting this flag to true will negatively impact the performance due to additional CUDA cache +cleaning operation after each model execution. Therefore, you should only use this flag if you +serve multiple models with Triton and encounter CUDA out of memory issue during model executions. 
+ +The section of model config file specifying this parameter will look like: + +``` +parameters: { +key: "ENABLE_CACHE_CLEANING" + value: { + string_value:"true" + } +} +``` + * Additional Optimizations: Three additional boolean parameters are available to disable certain Torch optimizations that can sometimes cause latency regressions in models with complex execution modes and dynamic shapes. If not specified, all are enabled by default. diff --git a/src/libtorch.cc b/src/libtorch.cc index a142f72..5449064 100644 --- a/src/libtorch.cc +++ b/src/libtorch.cc @@ -98,6 +98,7 @@ class ModelState : public BackendModel { { return enable_nvfuser_pair_; } + bool EnabledCacheCleaning(){ return enable_cache_cleaning_; } bool EnabledWeightSharing() { return enable_weight_sharing_; } @@ -114,6 +115,9 @@ class ModelState : public BackendModel { // Flag to indicate whether inference mode is enabled. Defaults to false. bool enable_inference_mode_; + // Flag to indicate whether cache clearning after each run is enabled. Defaults to false. + bool enable_cache_cleaning_; + // Flag to indicate whether weight sharing is enabled. Defaults to false. bool enable_weight_sharing_; @@ -173,7 +177,8 @@ ModelState::Create(TRITONBACKEND_Model* triton_model, ModelState** state) ModelState::ModelState(TRITONBACKEND_Model* triton_model) : BackendModel(triton_model), enable_optimized_execution_(true), - enable_inference_mode_(false), enable_weight_sharing_(false), + enable_inference_mode_(false), enable_cache_cleaning_(false), + enable_weight_sharing_(false), enable_tensor_fuser_pair_({false, true}), enable_jit_profiling_pair_({false, true}), enable_jit_executor_pair_({false, true}), @@ -298,6 +303,25 @@ ModelState::ParseParameters() " for model instance '" + Name() + "'") .c_str()); + // If 'ENABLE_CACHE_CLEANING' is not present in 'parameters' then + // no update is made to 'enable_cache_cleaning_'. + err = ParseParameter( + params, "ENABLE_CACHE_CLEANING", &enable_cache_cleaning_); + if (err != nullptr) { + if (TRITONSERVER_ErrorCode(err) != TRITONSERVER_ERROR_NOT_FOUND) { + return err; + } else { + TRITONSERVER_ErrorDelete(err); + } + } + + LOG_MESSAGE( + TRITONSERVER_LOG_INFO, + (std::string("Cache Cleaning is ") + + (enable_cache_cleaning_ ? "enabled" : "disabled") + + " for model instance '" + Name() + "'") + .c_str()); + // If 'INFERENCE_MODE' is not present in 'parameters' then no update is made // to 'enable_inference_mode_'. err = ParseParameter(params, "INFERENCE_MODE", &enable_inference_mode_); @@ -453,6 +477,9 @@ class ModelInstanceState : public BackendModelInstance { void ProcessRequests( TRITONBACKEND_Request** requests, const uint32_t request_count); + // Clear CUDA cache + void ClearCache(); + private: ModelInstanceState( ModelState* model_state, @@ -585,9 +612,8 @@ ModelInstanceState::ModelInstanceState( THROW_IF_BACKEND_INSTANCE_ERROR(ValidateOutputs()); } -ModelInstanceState::~ModelInstanceState() +void ModelInstanceState::ClearCache() { - torch_model_.reset(); #ifdef TRITON_ENABLE_GPU if (device_.is_cuda()) { c10::cuda::CUDACachingAllocator::emptyCache(); @@ -595,6 +621,12 @@ ModelInstanceState::~ModelInstanceState() #endif // TRITON_ENABLE_GPU } +ModelInstanceState::~ModelInstanceState() +{ + torch_model_.reset(); + ClearCache(); +} + TRITONSERVER_Error* ModelInstanceState::ValidateBooleanSequenceControl( triton::common::TritonJson::Value& sequence_batching, @@ -2081,6 +2113,10 @@ TRITONBACKEND_ModelInstanceExecute( // specific request. 
instance_state->ProcessRequests(requests, request_count); + if(model_state->EnabledCacheCleaning()) { + instance_state->ClearCache(); + } + return nullptr; // success } From 3ecda562665cfb29a3584fdce80610450a98d821 Mon Sep 17 00:00:00 2001 From: Kris Hung Date: Thu, 7 Jul 2022 17:25:51 -0700 Subject: [PATCH 10/76] Add reshape+batching and dynamic batching support for string I/O (#69) * reshape+batching and dynamic batching support for string I/O * Address comment * Address comment --- README.md | 2 -- src/libtorch.cc | 84 +++++++++++++++++++------------------------------ 2 files changed, 33 insertions(+), 53 deletions(-) diff --git a/README.md b/README.md index d5050f6..da9d391 100644 --- a/README.md +++ b/README.md @@ -231,5 +231,3 @@ a List of Strings as input(s) / produces a List of String as output(s). For thes Triton allows users to pass String input(s)/recieve String output(s) using the String datatype. As a limitation of using List instead of Tensor for String I/O, only for 1-dimensional input(s)/output(s) are supported for I/O of String type. -Batching is not allowed for PyTorch models with String I/O. For these models, -the user must specify `max_batch_size: 0` in the configuration. diff --git a/src/libtorch.cc b/src/libtorch.cc index 5449064..336629c 100644 --- a/src/libtorch.cc +++ b/src/libtorch.cc @@ -535,6 +535,9 @@ class ModelInstanceState : public BackendModelInstance { // If the input to the tensor is a dictionary of tensors. bool is_dict_input_; + + // If the model supports batching. + bool supports_batching_; }; TRITONSERVER_Error* @@ -607,6 +610,7 @@ ModelInstanceState::ModelInstanceState( expected_input_cnt += 1; } } + supports_batching_ = model_state_->MaxBatchSize() > 0; THROW_IF_BACKEND_INSTANCE_ERROR(ValidateInputs(expected_input_cnt)); THROW_IF_BACKEND_INSTANCE_ERROR(ValidateOutputs()); @@ -787,7 +791,6 @@ ModelInstanceState::ValidateInputs(const size_t expected_input_cnt) "specified."); } - bool supports_batching = model_state_->MaxBatchSize() > 0; NamingConvention naming_convention; RETURN_IF_ERROR(GetNamingConvention(&naming_convention, allowed_inputs)); @@ -837,8 +840,7 @@ ModelInstanceState::ValidateInputs(const size_t expected_input_cnt) .c_str()); } - // Validate shape for String inputs. Only allow 1 dimension and no - // batching. + // Validate shape for String inputs. Only allow 1 dimension. if (io_dtype == "TYPE_STRING") { // If a reshape is provided for the input then use that when // validating the model shapes. @@ -850,7 +852,7 @@ ModelInstanceState::ValidateInputs(const size_t expected_input_cnt) RETURN_IF_ERROR(ParseShape(io, "dims", &dims)); } - if ((dims.size() > 1) || supports_batching) { + if ((dims.size() + (supports_batching_ ? 1 : 0)) > 1) { return TRITONSERVER_ErrorNew( TRITONSERVER_ERROR_INTERNAL, ("Triton only supports 1 dimensional List of String as input for " @@ -880,7 +882,6 @@ ModelInstanceState::ValidateOutputs() "specified."); } - const bool supports_batching = model_state_->MaxBatchSize() > 0; NamingConvention naming_convention; RETURN_IF_ERROR(GetNamingConvention(&naming_convention, {})); @@ -917,8 +918,7 @@ ModelInstanceState::ValidateOutputs() .c_str()); } - // Validate shape for String outputs. Only allow 1 dimension and no - // batching. + // Validate shape for String outputs. Only allow 1 dimension. if (io_dtype == "TYPE_STRING") { // If a reshape is provided for the output then use that when // validating the model shapes. 
@@ -930,7 +930,7 @@ ModelInstanceState::ValidateOutputs() RETURN_IF_ERROR(ParseShape(io, "dims", &dims)); } - if ((dims.size() > 1) || supports_batching) { + if ((dims.size() + (supports_batching_ ? 1 : 0)) > 1) { return TRITONSERVER_ErrorNew( TRITONSERVER_ERROR_INTERNAL, ("Triton only supports 1 dimensional List of String as output for " @@ -1015,7 +1015,7 @@ ModelInstanceState::ProcessRequests( for (size_t i = 0; i < request_count; i++) { if (max_batch_size > 0) { // Retrieve the batch size from one of the inputs, if the model - // supports batching, the first dimension size is batch size + // supports batching, the first dimension size is batch size. TRITONBACKEND_Input* input; TRITONSERVER_Error* err = TRITONBACKEND_RequestInputByIndex(requests[i], 0 /* index */, &input); @@ -1294,7 +1294,7 @@ ModelInstanceState::Execute( if (list_output.elementType()->kind() != c10::TypeKind::StringType) { throw std::invalid_argument( "output at index " + std::to_string(op_index) + - " must be of type Tensor or List[str], recieved List[" + + " must be of type Tensor or List[str], received List[" + list_output.elementType()->str() + "]"); } output_tensors->push_back(m_op); @@ -1310,7 +1310,7 @@ ModelInstanceState::Execute( auto list_output = model_outputs_.toList(); if (list_output.elementType()->kind() != c10::TypeKind::StringType) { throw std::invalid_argument( - "output must be of type Tensor or List[str], recieved List[" + + "output must be of type Tensor or List[str], received List[" + list_output.elementType()->str() + "]"); } output_tensors->push_back(model_outputs_); @@ -1505,8 +1505,7 @@ GetContiguousInputContent( } void -FillStringTensor( - torch::List* input_list, const size_t idx, const size_t cnt) +FillStringTensor(torch::List* input_list, const size_t cnt) { for (size_t c = 0; c < cnt; ++c) { input_list->push_back(""); @@ -1517,9 +1516,8 @@ bool SetStringInputTensor( torch::List* input_list, TRITONBACKEND_Input* input, const char* name, const uint32_t buffer_count, - const size_t request_element_cnt, const size_t tensor_offset, - TRITONBACKEND_Response** response, cudaStream_t stream, - const char* host_policy_name) + const size_t request_element_cnt, TRITONBACKEND_Response** response, + cudaStream_t stream, const char* host_policy_name) { bool cuda_copy = false; size_t element_idx = 0; @@ -1537,9 +1535,7 @@ SetStringInputTensor( stream, &cuda_copy); if (err != nullptr) { RESPOND_AND_SET_NULL_IF_ERROR(response, err); - FillStringTensor( - input_list, tensor_offset + element_idx, - request_element_cnt - element_idx); + FillStringTensor(input_list, request_element_cnt - element_idx); return cuda_copy; } @@ -1564,9 +1560,6 @@ SetStringInputTensor( std::to_string(element_idx + 1) + " for inference input '" + name + "', expecting " + std::to_string(request_element_cnt)) .c_str())); - FillStringTensor( - input_list, tensor_offset + element_idx, - request_element_cnt - element_idx); return cuda_copy; } @@ -1585,9 +1578,7 @@ SetStringInputTensor( std::to_string(len) + " but only " + std::to_string(content_byte_size) + " bytes available") .c_str())); - FillStringTensor( - input_list, tensor_offset + element_idx, - request_element_cnt - element_idx); + FillStringTensor(input_list, request_element_cnt - element_idx); return cuda_copy; } @@ -1608,9 +1599,9 @@ SetStringInputTensor( " strings for inference input '" + name + "', got " + std::to_string(element_idx)) .c_str())); - FillStringTensor( - input_list, tensor_offset + element_idx, - request_element_cnt - element_idx); + if (element_idx < 
request_element_cnt) { + FillStringTensor(input_list, request_element_cnt - element_idx); + } } return cuda_copy; @@ -1620,7 +1611,7 @@ bool SetStringOutputBuffer( torch::List* tensor, TRITONBACKEND_Response** response, TRITONBACKEND_Output* response_output, const size_t tensor_element_count, - const size_t tensor_offset, cudaStream_t stream, std::string* serialized) + cudaStream_t stream, std::string* serialized) { bool cuda_copy = false; @@ -1677,8 +1668,6 @@ ModelInstanceState::SetInputTensors( std::vector* input_tensors, std::vector* input_memories, bool* cuda_copy) { - const int max_batch_size = model_state_->MaxBatchSize(); - // InferenceMode should be used to guard all tensors operations torch::InferenceMode infer_guard(model_state_->EnabledInferenceMode()); @@ -1705,7 +1694,7 @@ ModelInstanceState::SetInputTensors( // The shape for the entire input patch, [total_batch_size, ...] std::vector batchn_shape( input_shape, input_shape + input_dims_count); - if (max_batch_size != 0) { + if (supports_batching_) { batchn_shape[0] = total_batch_size; } @@ -1735,20 +1724,10 @@ ModelInstanceState::SetInputTensors( if (input_datatype == TRITONSERVER_TYPE_BYTES) { - if (batchn_shape.size() != 1) { - return TRITONSERVER_ErrorNew( - TRITONSERVER_ERROR_INTERNAL, ("Triton only supports 1 dimensional " - "List of string as input for '" + - std::string(input_name) + "'") - .c_str()); - } - // Create the PyTorch list to hold the strings. torch::List input_list; input_list.reserve(batchn_shape[0]); - size_t tensor_offset = 0; - for (size_t idx = 0; idx < request_count; idx++) { TRITONBACKEND_Input* input; RESPOND_AND_SET_NULL_IF_ERROR( @@ -1767,9 +1746,7 @@ ModelInstanceState::SetInputTensors( *cuda_copy |= SetStringInputTensor( &input_list, input, input_name, buffer_count, batch_element_cnt, - tensor_offset, &((*responses)[idx]), CudaStream(), - HostPolicyName().c_str()); - tensor_offset += batch_element_cnt; + &((*responses)[idx]), CudaStream(), HostPolicyName().c_str()); } (*input_tensors)[input_index_map_[input_name]] = input_list; @@ -1864,18 +1841,25 @@ ModelInstanceState::ReadOutputTensors( } else if (output_tensors[op_index].isList()) { // Custom handling for string/bytes tensor... - torch::List output_list = output_tensors[op_index].toList(); // Get output shape std::vector batchn_shape{(int64_t)output_list.size()}; - size_t tensor_offset = 0; - for (size_t idx = 0; idx < responses->size(); idx++) { + auto& request = requests[idx]; auto& response = (*responses)[idx]; + if (supports_batching_ != 0) { + TRITONBACKEND_Input* input; + TRITONBACKEND_RequestInputByIndex(request, 0 /* index*/, &input); + const int64_t* shape; + TRITONBACKEND_InputProperties( + input, nullptr, nullptr, &shape, nullptr, nullptr, nullptr); + batchn_shape[0] = shape[0]; + } + const size_t tensor_element_cnt = GetElementCount(batchn_shape); // Only need an response tensor for requested outputs. 
@@ -1889,10 +1873,8 @@ ModelInstanceState::ReadOutputTensors( string_buffer.emplace_back(new std::string()); cuda_copy |= SetStringOutputBuffer( &output_list, &response, response_output, tensor_element_cnt, - tensor_offset, CudaStream(), string_buffer.back().get()); + CudaStream(), string_buffer.back().get()); } - - tensor_offset += tensor_element_cnt; } } else { return TRITONSERVER_ErrorNew( From 1f89243397783234d94c0aa206be2e4953d217b7 Mon Sep 17 00:00:00 2001 From: Tanmay Verma Date: Fri, 8 Jul 2022 10:14:10 -0700 Subject: [PATCH 11/76] Use the SetModelConfig backend utility (#71) --- src/libtorch.cc | 10 +--------- 1 file changed, 1 insertion(+), 9 deletions(-) diff --git a/src/libtorch.cc b/src/libtorch.cc index 336629c..c1d403e 100644 --- a/src/libtorch.cc +++ b/src/libtorch.cc @@ -159,15 +159,7 @@ ModelState::Create(TRITONBACKEND_Model* triton_model, ModelState** state) triton_model, &auto_complete_config)); if (auto_complete_config) { RETURN_IF_ERROR((*state)->AutoCompleteConfig()); - - triton::common::TritonJson::WriteBuffer json_buffer; - (*state)->ModelConfig().Write(&json_buffer); - - TRITONSERVER_Message* message; - RETURN_IF_ERROR(TRITONSERVER_MessageNewFromSerializedJson( - &message, json_buffer.Base(), json_buffer.Size())); - RETURN_IF_ERROR(TRITONBACKEND_ModelSetConfig( - triton_model, 1 /* config_version */, message)); + RETURN_IF_ERROR((*state)->SetModelConfig()); } RETURN_IF_ERROR((*state)->ParseParameters()); From 3421d0b04531866e4fa59e0726b7d2fc60274834 Mon Sep 17 00:00:00 2001 From: Iman Tabrizian Date: Sat, 16 Jul 2022 23:20:20 -0400 Subject: [PATCH 12/76] Fix pytorch forward argument naming convention (#72) --- src/libtorch.cc | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/src/libtorch.cc b/src/libtorch.cc index c1d403e..a7b312a 100644 --- a/src/libtorch.cc +++ b/src/libtorch.cc @@ -506,7 +506,7 @@ class ModelInstanceState : public BackendModelInstance { // Get the naming convention for inputs/outputs from the model configuration TRITONSERVER_Error* GetNamingConvention( NamingConvention* naming_convention, - const std::set& allowed_io); + const std::vector& allowed_io); ModelState* model_state_; @@ -713,7 +713,7 @@ ModelInstanceState::ValidateInputs(const size_t expected_input_cnt) { // Collect all the expected input tensor names and validate that the model // configuration specifies only those. 
- std::set allowed_inputs; + std::vector allowed_inputs; const torch::jit::Method& method = torch_model_->get_method("forward"); const auto& schema = method.function().getSchema(); @@ -755,7 +755,7 @@ ModelInstanceState::ValidateInputs(const size_t expected_input_cnt) "Dict(str, Tensor) or input(s) of type Tensor are supported.") .c_str()); } - allowed_inputs.emplace(arguments.at(i).name()); + allowed_inputs.emplace_back(arguments.at(i).name()); } // If all inputs are tensors, match number of expected inputs between model @@ -800,7 +800,7 @@ ModelInstanceState::ValidateInputs(const size_t expected_input_cnt) } else { switch (naming_convention) { case NamingConvention::FORWARD_ARGUMENT: { - auto itr = allowed_inputs.find(io_name); + auto itr = std::find(allowed_inputs.begin(), allowed_inputs.end(), io_name); if (itr != allowed_inputs.end()) { input_index_map_[io_name] = std::distance(allowed_inputs.begin(), itr); @@ -1325,7 +1325,7 @@ ModelInstanceState::Execute( TRITONSERVER_Error* ModelInstanceState::GetNamingConvention( NamingConvention* naming_convention, - const std::set& allowed_ios) + const std::vector& allowed_ios) { // Rules for (non-Dictionary) input tensor names: // 1. Must be in 'allowed_inputs' (arguments in the forward function) @@ -1358,7 +1358,7 @@ ModelInstanceState::GetNamingConvention( // Validate name std::string io_name; RETURN_IF_ERROR(io.MemberAsString("name", &io_name)); - auto itr = allowed_ios.find(io_name); + auto itr = std::find(allowed_ios.begin(), allowed_ios.end(), io_name); if (itr == allowed_ios.end()) { *naming_convention = NamingConvention::NAMED_INDEX; break; From 8ae6cd42cf21cc6f232c341289fcd9bcf3818c16 Mon Sep 17 00:00:00 2001 From: hemantj Date: Wed, 6 Jul 2022 09:55:52 -0700 Subject: [PATCH 13/76] Fix path for new directory structure --- CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index b5c4754..5c1a9bd 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -215,7 +215,7 @@ if (${TRITON_PYTORCH_DOCKER_BUILD}) COMMAND docker cp pytorch_backend_ptlib:/opt/conda/lib/python3.8/site-packages/torch_tensorrt/bin/torchtrtc torchtrtc || echo "error ignored..." || true COMMAND docker cp pytorch_backend_ptlib:/opt/pytorch/pytorch/LICENSE LICENSE.pytorch COMMAND docker cp pytorch_backend_ptlib:/opt/conda/lib/python3.8/site-packages/torch/include include/torch - COMMAND docker cp pytorch_backend_ptlib:/opt/pytorch/pytorch/torch/csrc/jit/codegen include/torch/torch/csrc/jit/codegen + COMMAND docker cp pytorch_backend_ptlib:/opt/pytorch/pytorch/torch/csrc/jit/codegen include/torch/torch/csrc/jit/. 
COMMAND docker cp pytorch_backend_ptlib:/opt/pytorch/vision/torchvision/csrc include/torchvision/torchvision COMMAND docker cp pytorch_backend_ptlib:/usr/lib/${LIBS_ARCH}-linux-gnu/libopencv_videoio.so.3.4.11 libopencv_videoio.so COMMAND docker cp pytorch_backend_ptlib:/usr/lib/${LIBS_ARCH}-linux-gnu/libopencv_highgui.so.3.4.11 libopencv_highgui.so From 8c1c6bd741814bd7f7074801d7d01b09dd0f5a8a Mon Sep 17 00:00:00 2001 From: Iman Tabrizian Date: Mon, 8 Aug 2022 15:00:37 -0400 Subject: [PATCH 14/76] Fix multiple instance performance (#73) * Fix multiple instance performance * Fix cuda stream destruction * Fix CPU version * Refactor event capturing * Review edit * fix up --- src/libtorch.cc | 230 ++++++++++++++++++++++++++++-------------- src/libtorch_utils.cc | 14 +++ src/libtorch_utils.h | 11 +- 3 files changed, 174 insertions(+), 81 deletions(-) diff --git a/src/libtorch.cc b/src/libtorch.cc index a7b312a..76567dc 100644 --- a/src/libtorch.cc +++ b/src/libtorch.cc @@ -49,6 +49,7 @@ #ifdef TRITON_ENABLE_GPU #include +#include #include #endif // TRITON_ENABLE_GPU @@ -98,9 +99,10 @@ class ModelState : public BackendModel { { return enable_nvfuser_pair_; } - bool EnabledCacheCleaning(){ return enable_cache_cleaning_; } + bool EnabledCacheCleaning() { return enable_cache_cleaning_; } bool EnabledWeightSharing() { return enable_weight_sharing_; } + const std::vector& ModelOutputs() { return output_names_; } private: ModelState(TRITONBACKEND_Model* triton_model); @@ -115,7 +117,8 @@ class ModelState : public BackendModel { // Flag to indicate whether inference mode is enabled. Defaults to false. bool enable_inference_mode_; - // Flag to indicate whether cache clearning after each run is enabled. Defaults to false. + // Flag to indicate whether cache cleaning after each run is enabled. + // Defaults to false. bool enable_cache_cleaning_; // Flag to indicate whether weight sharing is enabled. Defaults to false. @@ -138,6 +141,10 @@ class ModelState : public BackendModel { std::map< std::pair, std::shared_ptr> torch_models_; + + // List of all the outputs specified in the output section of model + // configuration. + std::vector output_names_; }; TRITONSERVER_Error* @@ -170,12 +177,27 @@ ModelState::Create(TRITONBACKEND_Model* triton_model, ModelState** state) ModelState::ModelState(TRITONBACKEND_Model* triton_model) : BackendModel(triton_model), enable_optimized_execution_(true), enable_inference_mode_(false), enable_cache_cleaning_(false), - enable_weight_sharing_(false), - enable_tensor_fuser_pair_({false, true}), + enable_weight_sharing_(false), enable_tensor_fuser_pair_({false, true}), enable_jit_profiling_pair_({false, true}), enable_jit_executor_pair_({false, true}), enable_nvfuser_pair_({false, false}) { + output_names_.clear(); + + triton::common::TritonJson::Value ios; + THROW_IF_BACKEND_INSTANCE_ERROR(ModelConfig().MemberAsArray("output", &ios)); + for (size_t i = 0; i < ios.ArraySize(); i++) { + triton::common::TritonJson::Value io; + THROW_IF_BACKEND_INSTANCE_ERROR(ios.IndexAsObject(i, &io)); + + // Use names from ModelConfig by reference since the model + // config will persist longer than this inference execution. 
+ const char* io_name; + size_t io_name_len; + THROW_IF_BACKEND_INSTANCE_ERROR( + io.MemberAsString("name", &io_name, &io_name_len)); + output_names_.emplace_back(io_name); + } } TRITONSERVER_Error* @@ -497,11 +519,12 @@ class ModelInstanceState : public BackendModelInstance { std::vector* input_tensors, std::vector* input_memories, bool* cuda_copy); TRITONSERVER_Error* ReadOutputTensors( - size_t total_batch_size, const std::vector& output_names, + size_t total_batch_size, const std::vector& output_tensors, TRITONBACKEND_Request** requests, const uint32_t request_count, - std::vector* responses, - uint64_t* compute_end_ns); + std::vector* responses); + TRITONSERVER_Error* RecordBackendTimestamp( + uint64_t* timestamp, void* cuda_event); // Get the naming convention for inputs/outputs from the model configuration TRITONSERVER_Error* GetNamingConvention( @@ -530,6 +553,13 @@ class ModelInstanceState : public BackendModelInstance { // If the model supports batching. bool supports_batching_; + +#ifdef TRITON_ENABLE_GPU + // PyTorch stream used for execution of inferences. + cudaEvent_t compute_input_start_event_; + cudaEvent_t compute_infer_start_event_; + cudaEvent_t compute_output_start_event_; +#endif }; TRITONSERVER_Error* @@ -556,7 +586,23 @@ ModelInstanceState::ModelInstanceState( model_state_(model_state), device_(torch::kCPU), is_dict_input_(false) { if (Kind() == TRITONSERVER_INSTANCEGROUPKIND_GPU) { +#ifdef TRITON_ENABLE_GPU device_ = torch::Device(torch::kCUDA, DeviceId()); + // Need to set the CUDA context so that the context that events are + // created on match with contexts that events are recorded with. + THROW_IF_BACKEND_INSTANCE_ERROR(ConvertCUDAStatusToTritonError( + cudaSetDevice(DeviceId()), TRITONSERVER_ERROR_INTERNAL, + "Failed to set the device")); + THROW_IF_BACKEND_INSTANCE_ERROR(ConvertCUDAStatusToTritonError( + cudaEventCreate(&compute_input_start_event_), + TRITONSERVER_ERROR_INTERNAL, "Failed to create cuda event")); + THROW_IF_BACKEND_INSTANCE_ERROR(ConvertCUDAStatusToTritonError( + cudaEventCreate(&compute_infer_start_event_), + TRITONSERVER_ERROR_INTERNAL, "Failed to create cuda event")); + THROW_IF_BACKEND_INSTANCE_ERROR(ConvertCUDAStatusToTritonError( + cudaEventCreate(&compute_output_start_event_), + TRITONSERVER_ERROR_INTERNAL, "Failed to create cuda event")); +#endif } THROW_IF_BACKEND_INSTANCE_ERROR(model_state->LoadModel( @@ -570,6 +616,7 @@ ModelInstanceState::ModelInstanceState( } } + // If this is a sequence model then make sure that the required // inputs are present in the model and have the correct shape and // datatype. 
@@ -608,7 +655,8 @@ ModelInstanceState::ModelInstanceState( THROW_IF_BACKEND_INSTANCE_ERROR(ValidateOutputs()); } -void ModelInstanceState::ClearCache() +void +ModelInstanceState::ClearCache() { #ifdef TRITON_ENABLE_GPU if (device_.is_cuda()) { @@ -800,7 +848,8 @@ ModelInstanceState::ValidateInputs(const size_t expected_input_cnt) } else { switch (naming_convention) { case NamingConvention::FORWARD_ARGUMENT: { - auto itr = std::find(allowed_inputs.begin(), allowed_inputs.end(), io_name); + auto itr = + std::find(allowed_inputs.begin(), allowed_inputs.end(), io_name); if (itr != allowed_inputs.end()) { input_index_map_[io_name] = std::distance(allowed_inputs.begin(), itr); @@ -950,6 +999,14 @@ ModelInstanceState::ProcessRequests( std::to_string(request_count) + " requests") .c_str()); + if (Kind() == TRITONSERVER_INSTANCEGROUPKIND_GPU) { +#ifdef TRITON_ENABLE_GPU + at::cuda::CUDAStream torch_stream = + at::cuda::getStreamFromExternal(stream_, DeviceId()); + at::cuda::setCurrentCUDAStream(torch_stream); +#endif + } + NVTX_RANGE(nvtx_, "ProcessRequests " + Name()); uint64_t exec_start_ns = 0; @@ -1003,7 +1060,6 @@ ModelInstanceState::ProcessRequests( } } - for (size_t i = 0; i < request_count; i++) { if (max_batch_size > 0) { // Retrieve the batch size from one of the inputs, if the model @@ -1056,6 +1112,15 @@ ModelInstanceState::ProcessRequests( std::vector input_memories; bool cuda_copy = false; std::unique_ptr collector; + if (Kind() == TRITONSERVER_INSTANCEGROUPKIND_GPU) { +#ifdef TRITON_ENABLE_GPU + RESPOND_ALL_AND_SET_TRUE_IF_ERROR( + responses, request_count, all_response_failed, + ConvertCUDAStatusToTritonError( + cudaEventRecord(compute_input_start_event_, stream_), + TRITONSERVER_ERROR_INTERNAL, "Failed to record the event.")); +#endif + } if (!all_response_failed) { collector.reset(new BackendInputCollector( @@ -1070,52 +1135,24 @@ ModelInstanceState::ProcessRequests( &cuda_copy)); } - // Request to retrieve all model outputs. 'output_names' and - // 'output_tensors' are parallel vectors and so must be kept in - // sync. - std::vector output_names; - std::vector output_tensors; - if (!all_response_failed) { - triton::common::TritonJson::Value ios; - TRITONSERVER_Error* err = - model_state_->ModelConfig().MemberAsArray("output", &ios); - if (err == nullptr) { - for (size_t i = 0; i < ios.ArraySize(); i++) { - triton::common::TritonJson::Value io; - err = ios.IndexAsObject(i, &io); - if (err != nullptr) { - break; - } - - // Use names from ModelConfig by reference since the model - // config will persist longer than this inference execution. - const char* io_name; - size_t io_name_len; - err = io.MemberAsString("name", &io_name, &io_name_len); - if (err != nullptr) { - break; - } - - output_names.emplace_back(io_name); - } - } - - if (err != nullptr) { - RESPOND_ALL_AND_SET_TRUE_IF_ERROR( - responses, request_count, all_response_failed, err); - output_names.clear(); - } - } - -// Wait for any in-flight input tensor copies to complete. 
+ // If the instance kind is not GPU, we need to synchronize the CUDA stream + if (Kind() != TRITONSERVER_INSTANCEGROUPKIND_GPU) { #ifdef TRITON_ENABLE_GPU - if (cuda_copy) { - cudaStreamSynchronize(CudaStream()); - } + if (cuda_copy) { + cudaStreamSynchronize(stream_); + cuda_copy = false; + } #endif + } + std::vector output_tensors; uint64_t compute_start_ns = 0; - SET_TIMESTAMP(compute_start_ns); + + RESPOND_ALL_AND_SET_TRUE_IF_ERROR( + responses, request_count, all_response_failed, + RecordBackendTimestamp( + &compute_start_ns, + reinterpret_cast(&compute_infer_start_event_))); // Run... if (!all_response_failed) { @@ -1135,7 +1172,7 @@ ModelInstanceState::ProcessRequests( int max_index = output_tensors.size() - 1; if (!all_response_failed) { - for (const auto& name : output_names) { + for (const auto& name : model_state_->ModelOutputs()) { int op_index = output_index_map_[name]; if ((op_index < 0) || (op_index > max_index)) { RESPOND_ALL_AND_SET_TRUE_IF_ERROR( @@ -1155,14 +1192,19 @@ ModelInstanceState::ProcessRequests( } uint64_t compute_end_ns = 0; + RESPOND_ALL_AND_SET_TRUE_IF_ERROR( + responses, request_count, all_response_failed, + RecordBackendTimestamp( + &compute_end_ns, + reinterpret_cast(&compute_output_start_event_))); if (!all_response_failed) { if (!invalid_index) { RESPOND_ALL_AND_SET_TRUE_IF_ERROR( responses, request_count, all_response_failed, ReadOutputTensors( - total_batch_size, output_names, output_tensors, requests, - request_count, &responses, &compute_end_ns)); + total_batch_size, output_tensors, requests, request_count, + &responses)); } } @@ -1182,6 +1224,33 @@ ModelInstanceState::ProcessRequests( } } + // We don't need an explicit CUDA syncrhonization here since we have already + // synchronized the stream in the ReadOutputTensors function. + if (Kind() == TRITONSERVER_INSTANCEGROUPKIND_GPU) { +#ifdef TRITON_ENABLE_GPU + float compute_input_duration = 0; + float compute_infer_duration = 0; + RESPOND_ALL_AND_SET_TRUE_IF_ERROR( + responses, request_count, all_response_failed, + ConvertCUDAStatusToTritonError( + cudaEventElapsedTime( + &compute_input_duration, compute_input_start_event_, + compute_infer_start_event_), + TRITONSERVER_ERROR_INTERNAL, "Failed to capture elapsed time")); + + RESPOND_ALL_AND_SET_TRUE_IF_ERROR( + responses, request_count, all_response_failed, + ConvertCUDAStatusToTritonError( + cudaEventElapsedTime( + &compute_infer_duration, compute_infer_start_event_, + compute_output_start_event_), + TRITONSERVER_ERROR_INTERNAL, "Failed to capture elapsed time")); + + compute_start_ns = exec_start_ns + (compute_input_duration * 1e6); + compute_end_ns = compute_start_ns + (compute_infer_duration * 1e6); +#endif + } + // Report statistics for each request. for (uint32_t r = 0; r < request_count; ++r) { auto& request = requests[r]; @@ -1714,7 +1783,6 @@ ModelInstanceState::SetInputTensors( ? options.device(torch::kCUDA, device_.index()) : options.device(torch::kCPU); - if (input_datatype == TRITONSERVER_TYPE_BYTES) { // Create the PyTorch list to hold the strings. 
torch::List input_list; @@ -1758,10 +1826,10 @@ ModelInstanceState::SetInputTensors( TRITONSERVER_Error* ModelInstanceState::ReadOutputTensors( - size_t total_batch_size, const std::vector& output_names, + size_t total_batch_size, const std::vector& output_tensors, TRITONBACKEND_Request** requests, const uint32_t request_count, - std::vector* responses, uint64_t* compute_end_ns) + std::vector* responses) { NVTX_RANGE(nvtx_, "ReadOutputTensors " + Name()); @@ -1773,8 +1841,8 @@ ModelInstanceState::ReadOutputTensors( bool cuda_copy = false; // The serialized string buffer must be valid until output copies are done std::vector> string_buffer; - for (size_t idx = 0; idx < output_names.size(); idx++) { - std::string name = output_names[idx]; + for (size_t idx = 0; idx < model_state_->ModelOutputs().size(); idx++) { + std::string name = model_state_->ModelOutputs()[idx]; int op_index = output_index_map_[name]; if (output_tensors[op_index].isTensor()) { @@ -1875,34 +1943,40 @@ ModelInstanceState::ReadOutputTensors( "' must be of type Tensor or List[str].") .c_str()); } - - // PyTorch uses asynchronous execution to run the model. Setting the compute - // end timestamp immediately after Execute() does not capture the complete - // model execution time. When the first output buffer is accessed/copied by - // ProcessTensor(), there is a synchronization that is done to ensure the - // data is correctly copied from the output tensor. To avoid overheads of - // additional synchronization, we continue to use the default cuda stream. - // However the drawback of this is that the compute infer time reported - // would be slightly later than it is in reality and the compute output time - // reported would be smaller than it is in reality. We allow this because - // synchronizing manually negatively impacts performance. - if (idx == 0) { - SET_TIMESTAMP(*compute_end_ns); - } } // Finalize and wait for any pending buffer copies. cuda_copy |= responder.Finalize(); + if (Kind() != TRITONSERVER_INSTANCEGROUPKIND_GPU) { #ifdef TRITON_ENABLE_GPU - if (cuda_copy) { - cudaStreamSynchronize(stream_); + if (cuda_copy) { + cudaStreamSynchronize(stream_); + cuda_copy = false; + } +#endif } -#endif // TRITON_ENABLE_GPU return nullptr; } +TRITONSERVER_Error* +ModelInstanceState::RecordBackendTimestamp( + uint64_t* timestamp, void* cuda_event) +{ + if (Kind() == TRITONSERVER_INSTANCEGROUPKIND_GPU) { +#ifdef TRITON_ENABLE_GPU + cudaEvent_t* lcuda_event = reinterpret_cast(cuda_event); + RETURN_IF_ERROR(ConvertCUDAStatusToTritonError( + cudaEventRecord(*lcuda_event, stream_), TRITONSERVER_ERROR_INTERNAL, + "Failed to record the event.")); +#endif + } else { + SET_TIMESTAMP(*timestamp); + } + return nullptr; +} + ///////////// extern "C" { @@ -2087,7 +2161,7 @@ TRITONBACKEND_ModelInstanceExecute( // specific request. 
instance_state->ProcessRequests(requests, request_count); - if(model_state->EnabledCacheCleaning()) { + if (model_state->EnabledCacheCleaning()) { instance_state->ClearCache(); } diff --git a/src/libtorch_utils.cc b/src/libtorch_utils.cc index a554ba9..699c742 100644 --- a/src/libtorch_utils.cc +++ b/src/libtorch_utils.cc @@ -149,4 +149,18 @@ ParseParameter( return nullptr; } +#ifdef TRITON_ENABLE_GPU +TRITONSERVER_Error* +ConvertCUDAStatusToTritonError( + cudaError_t cuda_error,TRITONSERVER_Error_Code code, const char* msg) +{ + if (cuda_error != cudaSuccess) { + return TRITONSERVER_ErrorNew( + code, + (std::string(msg) + ": " + cudaGetErrorString(cuda_error)).c_str()); + } + return nullptr; // success +} +#endif + }}} // namespace triton::backend::pytorch diff --git a/src/libtorch_utils.h b/src/libtorch_utils.h index e112037..a8f0c0d 100644 --- a/src/libtorch_utils.h +++ b/src/libtorch_utils.h @@ -51,9 +51,14 @@ std::pair ConvertDataTypeToTorchType( std::pair ModelConfigDataTypeToTorchType( const std::string& data_type_str); -// If the key 'mkey' is present in 'params' then update 'value' with the value -// associated with that key. If 'mkey' is not present in 'params' then no update -// is made to 'value'. +#ifdef TRITON_ENABLE_GPU +TRITONSERVER_Error* ConvertCUDAStatusToTritonError( + cudaError_t cuda_error, TRITONSERVER_Error_Code code, const char* msg); +#endif + +// If the key 'mkey' is present in 'params' then update 'value' with the +// value associated with that key. If 'mkey' is not present in 'params' then +// no update is made to 'value'. TRITONSERVER_Error* ParseParameter( triton::common::TritonJson::Value& params, const std::string& mkey, bool* value); From bee8fde75ce8400c26cf7fe84df6f2cb7be34437 Mon Sep 17 00:00:00 2001 From: Iman Tabrizian Date: Tue, 9 Aug 2022 12:48:20 -0400 Subject: [PATCH 15/76] Fix CPU only build (#75) --- src/libtorch.cc | 4 ---- 1 file changed, 4 deletions(-) diff --git a/src/libtorch.cc b/src/libtorch.cc index 76567dc..cc3ab55 100644 --- a/src/libtorch.cc +++ b/src/libtorch.cc @@ -554,12 +554,9 @@ class ModelInstanceState : public BackendModelInstance { // If the model supports batching. bool supports_batching_; -#ifdef TRITON_ENABLE_GPU - // PyTorch stream used for execution of inferences. cudaEvent_t compute_input_start_event_; cudaEvent_t compute_infer_start_event_; cudaEvent_t compute_output_start_event_; -#endif }; TRITONSERVER_Error* @@ -616,7 +613,6 @@ ModelInstanceState::ModelInstanceState( } } - // If this is a sequence model then make sure that the required // inputs are present in the model and have the correct shape and // datatype. 
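
The timing change in the patch above replaces host-side timestamps on the GPU path with CUDA events recorded on the instance's stream, from which `compute_start_ns` and `compute_end_ns` are later reconstructed via `cudaEventElapsedTime`. The standalone sketch below is not taken from the backend source; the variable names, the empty "enqueue work here" placeholders, and the build command are illustrative assumptions. It only shows the same event-record/elapsed-time pattern with plain CUDA runtime calls, assuming a CUDA toolkit is available:

```cpp
// Minimal sketch of the CUDA-event timing pattern adopted above: record an
// event on the stream at the start of each phase, then convert the measured
// elapsed times into nanosecond timestamps relative to a host-side baseline.
// Build (assumption): nvcc -o event_timing event_timing.cu
#include <cuda_runtime.h>
#include <chrono>
#include <cstdint>
#include <cstdio>

int main() {
  cudaSetDevice(0);
  cudaStream_t stream;
  cudaStreamCreate(&stream);

  cudaEvent_t input_start, infer_start, output_start;
  cudaEventCreate(&input_start);
  cudaEventCreate(&infer_start);
  cudaEventCreate(&output_start);

  // Host-side baseline, analogous to 'exec_start_ns'.
  uint64_t exec_start_ns =
      std::chrono::duration_cast<std::chrono::nanoseconds>(
          std::chrono::steady_clock::now().time_since_epoch()).count();

  cudaEventRecord(input_start, stream);
  // ... enqueue input copies on 'stream' here ...
  cudaEventRecord(infer_start, stream);
  // ... enqueue model execution on 'stream' here ...
  cudaEventRecord(output_start, stream);

  // Make sure the recorded events have completed before querying them.
  cudaStreamSynchronize(stream);

  float input_ms = 0.0f, infer_ms = 0.0f;
  cudaEventElapsedTime(&input_ms, input_start, infer_start);
  cudaEventElapsedTime(&infer_ms, infer_start, output_start);

  // Reconstruct nanosecond timestamps from the millisecond durations.
  uint64_t compute_start_ns = exec_start_ns + (uint64_t)(input_ms * 1e6);
  uint64_t compute_end_ns = compute_start_ns + (uint64_t)(infer_ms * 1e6);
  printf("compute window: %llu -> %llu ns\n",
         (unsigned long long)compute_start_ns,
         (unsigned long long)compute_end_ns);

  cudaEventDestroy(input_start);
  cudaEventDestroy(infer_start);
  cudaEventDestroy(output_start);
  cudaStreamDestroy(stream);
  return 0;
}
```

Because the events are recorded on the same stream that carries the copies and the model execution, the measured durations reflect device-side work while only a single synchronization is needed before the elapsed times are read back.
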
From 0220e01259697a691b552fd9a0553b4452281f17 Mon Sep 17 00:00:00 2001 From: Iman Tabrizian Date: Mon, 15 Aug 2022 22:28:37 -0400 Subject: [PATCH 16/76] Fix stream synchronization (#77) --- src/libtorch.cc | 22 +++++++++------------- 1 file changed, 9 insertions(+), 13 deletions(-) diff --git a/src/libtorch.cc b/src/libtorch.cc index cc3ab55..4cb83d2 100644 --- a/src/libtorch.cc +++ b/src/libtorch.cc @@ -1131,15 +1131,12 @@ ModelInstanceState::ProcessRequests( &cuda_copy)); } - // If the instance kind is not GPU, we need to synchronize the CUDA stream - if (Kind() != TRITONSERVER_INSTANCEGROUPKIND_GPU) { #ifdef TRITON_ENABLE_GPU - if (cuda_copy) { - cudaStreamSynchronize(stream_); - cuda_copy = false; - } -#endif + if (cuda_copy) { + cudaStreamSynchronize(stream_); + cuda_copy = false; } +#endif std::vector output_tensors; uint64_t compute_start_ns = 0; @@ -1944,14 +1941,13 @@ ModelInstanceState::ReadOutputTensors( // Finalize and wait for any pending buffer copies. cuda_copy |= responder.Finalize(); - if (Kind() != TRITONSERVER_INSTANCEGROUPKIND_GPU) { #ifdef TRITON_ENABLE_GPU - if (cuda_copy) { - cudaStreamSynchronize(stream_); - cuda_copy = false; - } + // We have to always synchronize the stream. This is to make sure that + // the events on the cuda stream are synchronized. Otherwise, the events + // are only guaranteed to be synchronized if the model provides the output + // on GPU. + cudaStreamSynchronize(stream_); #endif - } return nullptr; } From 5477b119214a066ba171d5b345c2e068998e219d Mon Sep 17 00:00:00 2001 From: GuanLuo <41310872+GuanLuo@users.noreply.github.com> Date: Wed, 17 Aug 2022 14:34:49 -0700 Subject: [PATCH 17/76] Fix possible "double send" of responses. (#79) * [DO NOT MERGE] WAR possible segfault * Add comment --- src/libtorch.cc | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/src/libtorch.cc b/src/libtorch.cc index 4cb83d2..4e327a3 100644 --- a/src/libtorch.cc +++ b/src/libtorch.cc @@ -1221,23 +1221,25 @@ ModelInstanceState::ProcessRequests( // synchronized the stream in the ReadOutputTensors function. 
if (Kind() == TRITONSERVER_INSTANCEGROUPKIND_GPU) { #ifdef TRITON_ENABLE_GPU + // [FIXME] in the case of cudaEventElapsedTime failure, should handle + // stats reporting more gracefully as the durations are inaccurate float compute_input_duration = 0; float compute_infer_duration = 0; - RESPOND_ALL_AND_SET_TRUE_IF_ERROR( - responses, request_count, all_response_failed, + LOG_IF_ERROR( ConvertCUDAStatusToTritonError( cudaEventElapsedTime( &compute_input_duration, compute_input_start_event_, compute_infer_start_event_), - TRITONSERVER_ERROR_INTERNAL, "Failed to capture elapsed time")); + TRITONSERVER_ERROR_INTERNAL, "Failed to capture elapsed time"), + "Failed to capture elapsed time"); - RESPOND_ALL_AND_SET_TRUE_IF_ERROR( - responses, request_count, all_response_failed, + LOG_IF_ERROR( ConvertCUDAStatusToTritonError( cudaEventElapsedTime( &compute_infer_duration, compute_infer_start_event_, compute_output_start_event_), - TRITONSERVER_ERROR_INTERNAL, "Failed to capture elapsed time")); + TRITONSERVER_ERROR_INTERNAL, "Failed to capture elapsed time"), + "Failed to capture elapsed time"); compute_start_ns = exec_start_ns + (compute_input_duration * 1e6); compute_end_ns = compute_start_ns + (compute_infer_duration * 1e6); From 935f4a5afbbece6d79dd9114eff0bf06f2c849f4 Mon Sep 17 00:00:00 2001 From: Iman Tabrizian Date: Thu, 1 Sep 2022 15:01:37 -0400 Subject: [PATCH 18/76] Remove PyTorch multiple instance known issue (#80) --- README.md | 8 -------- 1 file changed, 8 deletions(-) diff --git a/README.md b/README.md index da9d391..616a204 100644 --- a/README.md +++ b/README.md @@ -218,14 +218,6 @@ complex execution modes and dynamic shapes. If not specified, all are enabled by state and a restart of the server may be required to continue serving successfully. -* Multiple instances of the PyTorch model on GPU do not always - increase performance. Due to thread specific caching in PyTorch, using - multiple instances of the model interact negatively. See - [here](https://github.com/pytorch/pytorch/issues/27902) for more details. - Setting the parameter `DISABLE_OPTIMIZED_EXECUTION` to "true" in the model - configuration may help in some cases to avoid these negative interactions - due to model specific caching and increase multiple instance performance. - * PyTorch does not support Tensor of Strings but it does support models that accept a List of Strings as input(s) / produces a List of String as output(s). For these models Triton allows users to pass String input(s)/recieve String output(s) using the String From f85fbab31856a118d1bcc5ae2176dc164cf2872a Mon Sep 17 00:00:00 2001 From: holidaydrien Date: Tue, 25 Oct 2022 20:34:39 -0400 Subject: [PATCH 19/76] fix typo (#81) --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 616a204..e39152a 100644 --- a/README.md +++ b/README.md @@ -220,6 +220,6 @@ complex execution modes and dynamic shapes. If not specified, all are enabled by * PyTorch does not support Tensor of Strings but it does support models that accept a List of Strings as input(s) / produces a List of String as output(s). For these models -Triton allows users to pass String input(s)/recieve String output(s) using the String +Triton allows users to pass String input(s)/receive String output(s) using the String datatype. As a limitation of using List instead of Tensor for String I/O, only for 1-dimensional input(s)/output(s) are supported for I/O of String type. 
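
The string I/O limitation described in the README hunk above can also be seen from the LibTorch side: a 1-dimensional `TYPE_STRING` input is materialized as a list of strings, and the model is expected to return either a `Tensor` or a `List[str]`. The sketch below is standalone and not backend code; the model path `string_model.pt` and the sample strings are placeholders, and it assumes LibTorch headers and libraries are available:

```cpp
// Minimal LibTorch sketch showing how a 1-D list of strings can be passed to
// a TorchScript module and how a Tensor-or-List[str] result can be read back,
// mirroring the string I/O handling described above.
#include <torch/script.h>
#include <iostream>
#include <string>
#include <vector>

int main() {
  // Placeholder path to a scripted model whose forward() takes a List[str].
  torch::jit::script::Module module = torch::jit::load("string_model.pt");

  // Build the flat (1-dimensional) list of strings for one request.
  torch::List<std::string> input_list;
  input_list.reserve(2);
  input_list.push_back("hello");
  input_list.push_back("world");

  std::vector<torch::jit::IValue> inputs;
  inputs.emplace_back(input_list);

  torch::jit::IValue output = module.forward(inputs);
  if (output.isTensor()) {
    std::cout << output.toTensor() << std::endl;
  } else if (output.isList()) {
    // String outputs come back as a List[str].
    auto out_list = output.toList();
    for (size_t i = 0; i < out_list.size(); ++i) {
      std::cout << out_list.get(i).toStringRef() << std::endl;
    }
  }
  return 0;
}
```
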
From 4a971e6b6789310609ca84cf1c532084c1c6edc9 Mon Sep 17 00:00:00 2001 From: Misha Chornyi <99709299+mc-nv@users.noreply.github.com> Date: Wed, 23 Nov 2022 15:43:58 -0800 Subject: [PATCH 20/76] Update libraries path for PyTorch backend (#86) * Update 'libtorch' dependencies list * Update libraries and versions to sutisfy the dependency needs * Update library path * Remove ilp64 and iomp5 from build (#85) * Update library path * Remove ilp64 and iomp5 from build Co-authored-by: Misha Chornyi Co-authored-by: Iman Tabrizian --- CMakeLists.txt | 103 +++++++++++++++++++++++++++---------------------- 1 file changed, 56 insertions(+), 47 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 5c1a9bd..ff89da2 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -155,23 +155,22 @@ endif() # TRITON_PYTORCH_ENABLE_TORCHTRT if (CMAKE_HOST_SYSTEM_PROCESSOR MATCHES "aarch64") set(LIBS_ARCH "aarch64") - set(CONDA_LIBS + set(LIBTORCH_LIBS "libopenblas.so.0" ) else() set(LIBS_ARCH "x86_64") - set(CONDA_LIBS - "libmkl_core.so" - "libmkl_gnu_thread.so" - "libmkl_intel_lp64.so" - "libmkl_intel_thread.so" - "libmkl_def.so" - "libmkl_vml_def.so" - "libmkl_rt.so" - "libmkl_avx2.so" - "libmkl_avx512.so" - "libmkl_sequential.so" - "libomp.so" + set(LIBTORCH_LIBS + "libmkl_avx2.so.1" + "libmkl_avx512.so.1" + "libmkl_core.so.1" + "libmkl_def.so.1" + "libmkl_gnu_thread.so.1" + "libmkl_intel_lp64.so.1" + "libmkl_intel_thread.so.1" + "libmkl_rt.so.1" + "libmkl_sequential.so.1" + "libmkl_vml_def.so.1" ) endif() set(OPENCV_LIBS @@ -180,7 +179,10 @@ set(OPENCV_LIBS "libopencv_highgui.so" "libopencv_imgcodecs.so" "libopencv_imgproc.so" - "libopencv_core.so" + "libopencv_core.so" + "libopencv_calib3d.so" + "libopencv_flann.so" + "libopencv_features2d.so" "libpng16.so" "libjpeg.so" ) @@ -189,12 +191,12 @@ set(OPENCV_LIBS # Without these, the framework/backend complains of missing libraries / symbols and # in some cases leads to segmentation faults. if (${TRITON_PYTORCH_DOCKER_BUILD}) - string(REPLACE ";" " " CONDA_LIBS_STR "${CONDA_LIBS}") + string(REPLACE ";" " " LIBTORCH_LIBS_STR "${LIBTORCH_LIBS}") add_custom_command( OUTPUT ${PT_LIBS} - ${CONDA_LIBS} + ${LIBTORCH_LIBS} ${OPENCV_LIBS} LICENSE.pytorch include/torch @@ -203,43 +205,47 @@ if (${TRITON_PYTORCH_DOCKER_BUILD}) COMMAND docker pull ${TRITON_PYTORCH_DOCKER_IMAGE} COMMAND docker rm pytorch_backend_ptlib || echo "error ignored..." 
|| true COMMAND docker create --name pytorch_backend_ptlib ${TRITON_PYTORCH_DOCKER_IMAGE} - COMMAND /bin/sh -c "for i in ${CONDA_LIBS_STR} ; do echo copying $i && docker cp -L pytorch_backend_ptlib:/opt/conda/lib/$i $i ; done" - COMMAND docker cp pytorch_backend_ptlib:/opt/conda/lib/python3.8/site-packages/torch/lib/libc10.so libc10.so - COMMAND docker cp pytorch_backend_ptlib:/opt/conda/lib/python3.8/site-packages/torch/lib/libc10_cuda.so libc10_cuda.so - COMMAND docker cp pytorch_backend_ptlib:/opt/conda/lib/python3.8/site-packages/torch/lib/libtorch.so libtorch.so - COMMAND docker cp pytorch_backend_ptlib:/opt/conda/lib/python3.8/site-packages/torch/lib/libtorch_cpu.so libtorch_cpu.so - COMMAND docker cp pytorch_backend_ptlib:/opt/conda/lib/python3.8/site-packages/torch/lib/libtorch_cuda.so libtorch_cuda.so - COMMAND docker cp pytorch_backend_ptlib:/opt/conda/lib/python3.8/site-packages/torch/lib/libtorch_global_deps.so libtorch_global_deps.so - COMMAND docker cp pytorch_backend_ptlib:/opt/pytorch/vision/build/libtorchvision.so libtorchvision.so - COMMAND /bin/sh -c "if [ ${TRITON_PYTORCH_ENABLE_TORCHTRT} = 'ON' ]; then docker cp pytorch_backend_ptlib:/opt/conda/lib/python3.8/site-packages/torch_tensorrt/lib/libtorchtrt_runtime.so libtorchtrt_runtime.so; fi" - COMMAND docker cp pytorch_backend_ptlib:/opt/conda/lib/python3.8/site-packages/torch_tensorrt/bin/torchtrtc torchtrtc || echo "error ignored..." || true + COMMAND /bin/sh -c "for i in ${LIBTORCH_LIBS_STR} ; do echo copying $i && docker cp -L pytorch_backend_ptlib:/usr/local/lib/$i $i ; done" + COMMAND docker cp pytorch_backend_ptlib:/usr/local/lib/python3.8/dist-packages/torch/lib/libc10.so libc10.so + COMMAND docker cp pytorch_backend_ptlib:/usr/local/lib/python3.8/dist-packages/torch/lib/libc10_cuda.so libc10_cuda.so + COMMAND docker cp pytorch_backend_ptlib:/usr/local/lib/python3.8/dist-packages/torch/lib/libtorch.so libtorch.so + COMMAND docker cp pytorch_backend_ptlib:/usr/local/lib/python3.8/dist-packages/torch/lib/libtorch_cpu.so libtorch_cpu.so + COMMAND docker cp pytorch_backend_ptlib:/usr/local/lib/python3.8/dist-packages/torch/lib/libtorch_cuda.so libtorch_cuda.so + COMMAND docker cp pytorch_backend_ptlib:/usr/local/lib/python3.8/dist-packages/torch/lib/libtorch_global_deps.so libtorch_global_deps.so + COMMAND docker cp pytorch_backend_ptlib:/usr/local/lib/python3.8/dist-packages/torch/lib/libcaffe2_nvrtc.so libcaffe2_nvrtc.so + COMMAND docker cp pytorch_backend_ptlib:/usr/local/lib/libtorchvision.so libtorchvision.so + COMMAND /bin/sh -c "if [ ${TRITON_PYTORCH_ENABLE_TORCHTRT} = 'ON' ]; then docker cp pytorch_backend_ptlib:/usr/local/lib/python3.8/dist-packages/torch_tensorrt/lib/libtorchtrt_runtime.so libtorchtrt_runtime.so; fi" + COMMAND docker cp pytorch_backend_ptlib:/usr/local/lib/python3.8/dist-packages/torch_tensorrt/bin/torchtrtc torchtrtc || echo "error ignored..." || true COMMAND docker cp pytorch_backend_ptlib:/opt/pytorch/pytorch/LICENSE LICENSE.pytorch - COMMAND docker cp pytorch_backend_ptlib:/opt/conda/lib/python3.8/site-packages/torch/include include/torch + COMMAND docker cp pytorch_backend_ptlib:/usr/local/lib/python3.8/dist-packages/torch/include include/torch COMMAND docker cp pytorch_backend_ptlib:/opt/pytorch/pytorch/torch/csrc/jit/codegen include/torch/torch/csrc/jit/. 
COMMAND docker cp pytorch_backend_ptlib:/opt/pytorch/vision/torchvision/csrc include/torchvision/torchvision - COMMAND docker cp pytorch_backend_ptlib:/usr/lib/${LIBS_ARCH}-linux-gnu/libopencv_videoio.so.3.4.11 libopencv_videoio.so - COMMAND docker cp pytorch_backend_ptlib:/usr/lib/${LIBS_ARCH}-linux-gnu/libopencv_highgui.so.3.4.11 libopencv_highgui.so - COMMAND docker cp pytorch_backend_ptlib:/usr/lib/${LIBS_ARCH}-linux-gnu/libopencv_video.so.3.4.11 libopencv_video.so - COMMAND docker cp pytorch_backend_ptlib:/usr/lib/${LIBS_ARCH}-linux-gnu/libopencv_imgcodecs.so.3.4.11 libopencv_imgcodecs.so - COMMAND docker cp pytorch_backend_ptlib:/usr/lib/${LIBS_ARCH}-linux-gnu/libopencv_imgproc.so.3.4.11 libopencv_imgproc.so - COMMAND docker cp pytorch_backend_ptlib:/usr/lib/${LIBS_ARCH}-linux-gnu/libopencv_core.so.3.4.11 libopencv_core.so + COMMAND docker cp -L pytorch_backend_ptlib:/usr/local/lib/libopencv_videoio.so libopencv_videoio.so + COMMAND docker cp -L pytorch_backend_ptlib:/usr/local/lib/libopencv_highgui.so libopencv_highgui.so + COMMAND docker cp -L pytorch_backend_ptlib:/usr/local/lib/libopencv_video.so libopencv_video.so + COMMAND docker cp -L pytorch_backend_ptlib:/usr/local/lib/libopencv_imgcodecs.so libopencv_imgcodecs.so + COMMAND docker cp -L pytorch_backend_ptlib:/usr/local/lib/libopencv_imgproc.so libopencv_imgproc.so + COMMAND docker cp -L pytorch_backend_ptlib:/usr/local/lib/libopencv_core.so libopencv_core.so + COMMAND docker cp -L pytorch_backend_ptlib:/usr/local/lib/libopencv_calib3d.so libopencv_calib3d.so + COMMAND docker cp -L pytorch_backend_ptlib:/usr/local/lib/libopencv_features2d.so libopencv_features2d.so + COMMAND docker cp -L pytorch_backend_ptlib:/usr/local/lib/libopencv_flann.so libopencv_flann.so COMMAND docker cp pytorch_backend_ptlib:/usr/lib/${LIBS_ARCH}-linux-gnu/libpng16.so.16.37.0 libpng16.so COMMAND docker cp pytorch_backend_ptlib:/usr/lib/${LIBS_ARCH}-linux-gnu/libjpeg.so.8.2.2 libjpeg.so - COMMAND /bin/sh -c "if [ -f libmkl_def.so ]; then patchelf --add-needed libmkl_gnu_thread.so libmkl_def.so; fi" - COMMAND /bin/sh -c "if [ -f libmkl_def.so ]; then patchelf --add-needed libmkl_core.so libmkl_def.so; fi" - COMMAND /bin/sh -c "if [ -f libmkl_avx2.so ]; then patchelf --add-needed libmkl_gnu_thread.so libmkl_avx2.so; fi" - COMMAND /bin/sh -c "if [ -f libmkl_avx2.so ]; then patchelf --add-needed libmkl_core.so libmkl_avx2.so; fi" - COMMAND /bin/sh -c "if [ -f libmkl_avx512.so ]; then patchelf --add-needed libmkl_gnu_thread.so libmkl_avx512.so; fi" - COMMAND /bin/sh -c "if [ -f libmkl_avx512.so ]; then patchelf --add-needed libmkl_core.so libmkl_avx512.so; fi" - COMMAND /bin/sh -c "if [ -f libmkl_vml_def.so ]; then patchelf --add-needed libmkl_gnu_thread.so libmkl_vml_def.so; fi" - COMMAND /bin/sh -c "if [ -f libmkl_vml_def.so ]; then patchelf --add-needed libmkl_intel_thread.so libmkl_vml_def.so; fi" - COMMAND /bin/sh -c "if [ -f libmkl_vml_def.so ]; then patchelf --add-needed libmkl_core.so libmkl_vml_def.so; fi" - COMMAND /bin/sh -c "if [ -f libmkl_intel_thread.so ]; then patchelf --add-needed libmkl_intel_lp64.so libmkl_intel_thread.so; fi" + COMMAND /bin/sh -c "if [ -f libmkl_def.so.1 ]; then patchelf --add-needed libmkl_gnu_thread.so.1 libmkl_def.so.1; fi" + COMMAND /bin/sh -c "if [ -f libmkl_def.so.1 ]; then patchelf --add-needed libmkl_core.so.1 libmkl_def.so.1; fi" + COMMAND /bin/sh -c "if [ -f libmkl_avx2.so.1 ]; then patchelf --add-needed libmkl_gnu_thread.so.1 libmkl_avx2.so.1; fi" + COMMAND /bin/sh -c "if [ -f libmkl_avx2.so.1 ]; then patchelf 
--add-needed libmkl_core.so.1 libmkl_avx2.so.1; fi" + COMMAND /bin/sh -c "if [ -f libmkl_avx512.so.1 ]; then patchelf --add-needed libmkl_gnu_thread.so.1 libmkl_avx512.so.1; fi" + COMMAND /bin/sh -c "if [ -f libmkl_avx512.so.1 ]; then patchelf --add-needed libmkl_core.so.1 libmkl_avx512.so.1; fi" + COMMAND /bin/sh -c "if [ -f libmkl_vml_def.so.1 ]; then patchelf --add-needed libmkl_gnu_thread.so.1 libmkl_vml_def.so.1; fi" + COMMAND /bin/sh -c "if [ -f libmkl_vml_def.so.1 ]; then patchelf --add-needed libmkl_intel_thread.so.1 libmkl_vml_def.so.1; fi" + COMMAND /bin/sh -c "if [ -f libmkl_vml_def.so.1 ]; then patchelf --add-needed libmkl_core.so.1 libmkl_vml_def.so.1; fi" + COMMAND /bin/sh -c "if [ -f libmkl_intel_thread.so.1 ]; then patchelf --add-needed libmkl_intel_lp64.so.1 libmkl_intel_thread.so.1; fi" COMMAND docker rm pytorch_backend_ptlib COMMENT "Extracting pytorch and torchvision libraries and includes from ${TRITON_PYTORCH_DOCKER_IMAGE}" VERBATIM ) - add_custom_target(ptlib_target DEPENDS ${PT_LIBS} ${CONDA_LIBS} ${OPENCV_LIBS}) + add_custom_target(ptlib_target DEPENDS ${PT_LIBS} ${LIBTORCH_LIBS} ${OPENCV_LIBS}) add_library(ptlib SHARED IMPORTED GLOBAL) add_dependencies(ptlib ptlib_target) @@ -405,7 +411,7 @@ install( if (${TRITON_PYTORCH_DOCKER_BUILD}) set(PT_LIB_PATHS "") - FOREACH(plib ${PT_LIBS} ${CONDA_LIBS} ${OPENCV_LIBS}) + FOREACH(plib ${PT_LIBS} ${LIBTORCH_LIBS} ${OPENCV_LIBS}) set(PT_LIB_PATHS ${PT_LIB_PATHS} "${CMAKE_CURRENT_BINARY_DIR}/${plib}") ENDFOREACH(plib) @@ -424,7 +430,7 @@ if (${TRITON_PYTORCH_DOCKER_BUILD}) ) endif() # TRITON_PYTORCH_ENABLE_TORCHTRT - FOREACH(plib ${PT_LIBS} ${CONDA_LIBS} ${OPENCV_LIBS}) + FOREACH(plib ${PT_LIBS} ${LIBTORCH_LIBS} ${OPENCV_LIBS}) install( CODE "EXECUTE_PROCESS( @@ -437,7 +443,7 @@ if (${TRITON_PYTORCH_DOCKER_BUILD}) ) ENDFOREACH(plib) - set(OPENCV_VERSION "3.4") + set(OPENCV_VERSION "406") install( CODE "EXECUTE_PROCESS( @@ -447,6 +453,9 @@ if (${TRITON_PYTORCH_DOCKER_BUILD}) COMMAND ln -sf libopencv_imgcodecs.so libopencv_imgcodecs.so.${OPENCV_VERSION} COMMAND ln -sf libopencv_imgproc.so libopencv_imgproc.so.${OPENCV_VERSION} COMMAND ln -sf libopencv_core.so libopencv_core.so.${OPENCV_VERSION} + COMMAND ln -sf libopencv_calib3d.so libopencv_calib3d.so.${OPENCV_VERSION} + COMMAND ln -sf libopencv_features2d.so libopencv_features2d.so.${OPENCV_VERSION} + COMMAND ln -sf libopencv_flann.so libopencv_flann.so.${OPENCV_VERSION} COMMAND ln -sf libpng16.so libpng16.so.16 COMMAND ln -sf libjpeg.so libjpeg.so.8 RESULT_VARIABLE LINK_STATUS From 9e9a9a6c1fcdc9e7b54d728f686ebf9cbe336592 Mon Sep 17 00:00:00 2001 From: Ryan McCormick Date: Tue, 6 Dec 2022 16:20:52 -0800 Subject: [PATCH 21/76] re-enable nvfuser (#87) * re-enable nvfuser * WARN->INFO, formatting --- src/libtorch.cc | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/src/libtorch.cc b/src/libtorch.cc index 4e327a3..66d2908 100644 --- a/src/libtorch.cc +++ b/src/libtorch.cc @@ -432,7 +432,6 @@ ModelState::ParseParameters() .c_str()); } - // TODO Re-enable NvFuser once fixed // If 'ENABLE_NVFUSER' is not present in 'parameters' then no // update is made to 'enable_nvfuser'. bool enable_nvfuser = false; @@ -448,11 +447,9 @@ ModelState::ParseParameters() TRITONSERVER_ErrorDelete(err); } } else { - // Override, disable NvFuser till fixed - enable_nvfuser = false; enable_nvfuser_pair_ = {true, enable_nvfuser}; LOG_MESSAGE( - TRITONSERVER_LOG_WARN, (std::string("NvFuser is ") + + TRITONSERVER_LOG_INFO, (std::string("NvFuser is ") + (enable_nvfuser ? 
"enabled" : "disabled") + " for model instance '" + Name() + "'") .c_str()); @@ -1231,7 +1228,7 @@ ModelInstanceState::ProcessRequests( &compute_input_duration, compute_input_start_event_, compute_infer_start_event_), TRITONSERVER_ERROR_INTERNAL, "Failed to capture elapsed time"), - "Failed to capture elapsed time"); + "Failed to capture elapsed time"); LOG_IF_ERROR( ConvertCUDAStatusToTritonError( @@ -1239,7 +1236,7 @@ ModelInstanceState::ProcessRequests( &compute_infer_duration, compute_infer_start_event_, compute_output_start_event_), TRITONSERVER_ERROR_INTERNAL, "Failed to capture elapsed time"), - "Failed to capture elapsed time"); + "Failed to capture elapsed time"); compute_start_ns = exec_start_ns + (compute_input_duration * 1e6); compute_end_ns = compute_start_ns + (compute_infer_duration * 1e6); From f81ec73e802db216eda108763364ec4796c16ebb Mon Sep 17 00:00:00 2001 From: Kris Hung Date: Sat, 31 Dec 2022 05:56:30 +0800 Subject: [PATCH 22/76] Remove unused input_memories variable (#89) --- src/libtorch.cc | 18 +++--------------- 1 file changed, 3 insertions(+), 15 deletions(-) diff --git a/src/libtorch.cc b/src/libtorch.cc index 66d2908..90972f0 100644 --- a/src/libtorch.cc +++ b/src/libtorch.cc @@ -513,8 +513,7 @@ class ModelInstanceState : public BackendModelInstance { const uint32_t request_count, std::vector* responses, BackendInputCollector* collector, std::vector* input_names, - std::vector* input_tensors, - std::vector* input_memories, bool* cuda_copy); + std::vector* input_tensors, bool* cuda_copy); TRITONSERVER_Error* ReadOutputTensors( size_t total_batch_size, const std::vector& output_tensors, @@ -1102,7 +1101,6 @@ ModelInstanceState::ProcessRequests( std::vector input_names; std::vector input_tensors; - std::vector input_memories; bool cuda_copy = false; std::unique_ptr collector; if (Kind() == TRITONSERVER_INSTANCEGROUPKIND_GPU) { @@ -1124,8 +1122,7 @@ ModelInstanceState::ProcessRequests( responses, request_count, all_response_failed, SetInputTensors( total_batch_size, requests, request_count, &responses, - collector.get(), &input_names, &input_tensors, &input_memories, - &cuda_copy)); + collector.get(), &input_names, &input_tensors, &cuda_copy)); } #ifdef TRITON_ENABLE_GPU @@ -1149,14 +1146,6 @@ ModelInstanceState::ProcessRequests( Execute(&responses, request_count, &input_tensors, &output_tensors); } - // Free BackendMemory used for inputs - for (BackendMemory* mem : input_memories) { - if (mem != nullptr) { - delete mem; - } - } - input_memories.clear(); - // Verify output indices are valid with number of outputs after execution bool invalid_index = false; int max_index = output_tensors.size() - 1; @@ -1718,8 +1707,7 @@ ModelInstanceState::SetInputTensors( const uint32_t request_count, std::vector* responses, BackendInputCollector* collector, std::vector* input_names, - std::vector* input_tensors, - std::vector* input_memories, bool* cuda_copy) + std::vector* input_tensors, bool* cuda_copy) { // InferenceMode should be used to guard all tensors operations torch::InferenceMode infer_guard(model_state_->EnabledInferenceMode()); From 2559db96d5fb9617d9e10b2926158d59cb61b29b Mon Sep 17 00:00:00 2001 From: dyastremsky <58150256+dyastremsky@users.noreply.github.com> Date: Wed, 8 Feb 2023 11:25:30 -0800 Subject: [PATCH 23/76] Add linear algebra library (#92) --- CMakeLists.txt | 2 ++ 1 file changed, 2 insertions(+) diff --git a/CMakeLists.txt b/CMakeLists.txt index ff89da2..d757874 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -136,6 +136,7 @@ set(PT_LIBS 
"libtorch.so" "libtorch_cpu.so" "libtorch_cuda.so" + "libtorch_cuda_linalg.so" "libtorch_global_deps.so" ) @@ -211,6 +212,7 @@ if (${TRITON_PYTORCH_DOCKER_BUILD}) COMMAND docker cp pytorch_backend_ptlib:/usr/local/lib/python3.8/dist-packages/torch/lib/libtorch.so libtorch.so COMMAND docker cp pytorch_backend_ptlib:/usr/local/lib/python3.8/dist-packages/torch/lib/libtorch_cpu.so libtorch_cpu.so COMMAND docker cp pytorch_backend_ptlib:/usr/local/lib/python3.8/dist-packages/torch/lib/libtorch_cuda.so libtorch_cuda.so + COMMAND docker cp pytorch_backend_ptlib:/usr/local/lib/python3.8/dist-packages/torch/lib/libtorch_cuda_linalg.so libtorch_cuda_linalg.so COMMAND docker cp pytorch_backend_ptlib:/usr/local/lib/python3.8/dist-packages/torch/lib/libtorch_global_deps.so libtorch_global_deps.so COMMAND docker cp pytorch_backend_ptlib:/usr/local/lib/python3.8/dist-packages/torch/lib/libcaffe2_nvrtc.so libcaffe2_nvrtc.so COMMAND docker cp pytorch_backend_ptlib:/usr/local/lib/libtorchvision.so libtorchvision.so From c077c862c045f24fec6274163a9913b609f78bcb Mon Sep 17 00:00:00 2001 From: kthui <18255193+kthui@users.noreply.github.com> Date: Wed, 8 Feb 2023 15:18:04 -0800 Subject: [PATCH 24/76] Add check for sequence data type (#93) --- src/libtorch.cc | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/src/libtorch.cc b/src/libtorch.cc index 90972f0..6f23faa 100644 --- a/src/libtorch.cc +++ b/src/libtorch.cc @@ -1,4 +1,4 @@ -// Copyright 2019-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// Copyright 2019-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions @@ -741,6 +741,15 @@ ModelInstanceState::ValidateTypedSequenceControl( } } + // check if the data type is supported by PyTorch + if (!ModelConfigDataTypeToTorchType(tensor_datatype).first) { + return TRITONSERVER_ErrorNew( + TRITONSERVER_ERROR_INTERNAL, + ("input '" + tensor_name + "' type '" + tensor_datatype + + "' is not supported by PyTorch.") + .c_str()); + } + ip_index = std::atoi(tensor_name.substr(start_pos + 2).c_str()); input_index_map_[tensor_name] = ip_index; } From 4a8a870f0c759ec6b0b23594881ba0ae384b60f3 Mon Sep 17 00:00:00 2001 From: R0CKSTAR Date: Fri, 10 Feb 2023 09:21:32 +0800 Subject: [PATCH 25/76] Update pytorch docker image tag to 22.12 in README.md (#91) * Update pytorch docker image tag to 22.12 in README.md * Update the copyright year in README.md --- README.md | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index e39152a..c832ae5 100644 --- a/README.md +++ b/README.md @@ -1,5 +1,5 @@ +# PyTorch (LibTorch) Backend + [![License](https://img.shields.io/badge/License-BSD3-lightgrey.svg)](https://opensource.org/licenses/BSD-3-Clause) -# PyTorch (LibTorch) Backend +The Triton backend for +[PyTorch](https://github.com/pytorch/pytorch) +is designed to run +[TorchScript](https://pytorch.org/docs/stable/jit.html) +models using the PyTorch C++ API. +All models created in PyTorch using the python API must be traced/scripted to produce a TorchScript model. + +You can learn more about Triton backends in the +[Triton Backend](https://github.com/triton-inference-server/backend) +repository. + +Ask questions or report problems using +[Triton Server issues](https://github.com/triton-inference-server/server/issues). -The Triton backend for [PyTorch](https://github.com/pytorch/pytorch). 
-You can learn more about Triton backends in the [backend -repo](https://github.com/triton-inference-server/backend). Ask -questions or report problems on the [issues -page](https://github.com/triton-inference-server/server/issues). -This backend is designed to run [TorchScript](https://pytorch.org/docs/stable/jit.html) -models using the PyTorch C++ API. All models created in PyTorch -using the python API must be traced/scripted to produce a TorchScript -model. - -Where can I ask general questions about Triton and Triton backends? -Be sure to read all the information below as well as the [general -Triton documentation](https://github.com/triton-inference-server/server#triton-inference-server) -available in the main [server](https://github.com/triton-inference-server/server) -repo. If you don't find your answer there you can ask questions on the -main Triton [issues page](https://github.com/triton-inference-server/server/issues). +Be sure to read all the information below as well as the +[general Triton documentation](https://github.com/triton-inference-server/server#triton-inference-server) +available in the [Triton Server](https://github.com/triton-inference-server/server) repository. ## Build the PyTorch Backend -Use a recent cmake to build. First install the required dependencies. +Use a recent cmake to build. +First install the required dependencies. -``` -$ apt-get install rapidjson-dev python3-dev python3-pip -$ pip3 install patchelf==0.17.2 +```bash +apt-get install rapidjson-dev python3-dev python3-pip +pip3 install patchelf==0.17.2 ``` -An appropriate PyTorch container from [NGC](https://ngc.nvidia.com) must be used. -For example, to build a backend that uses the 23.04 version of the PyTorch -container from NGC: +An appropriate PyTorch container from [NVIDIA NGC Catalog](https://ngc.nvidia.com) must be used. +For example, to build a backend that uses the 23.04 version of the PyTorch container from NGC: -``` -$ mkdir build -$ cd build -$ cmake -DCMAKE_INSTALL_PREFIX:PATH=`pwd`/install -DTRITON_PYTORCH_DOCKER_IMAGE="nvcr.io/nvidia/pytorch:23.04-py3" .. -$ make install +```bash +mkdir build +cd build +cmake -DCMAKE_INSTALL_PREFIX:PATH=`pwd`/install -DTRITON_PYTORCH_DOCKER_IMAGE="nvcr.io/nvidia/pytorch:23.04-py3" .. +make install ``` -The following required Triton repositories will be pulled and used in -the build. By default, the "main" branch/tag will be used for each repo -but the listed CMake argument can be used to override. +The following required Triton repositories will be pulled and used in the build. +By default, the `main` head will be used for each repository but the listed CMake argument can be used to override the value. -* triton-inference-server/backend: -DTRITON_BACKEND_REPO_TAG=[tag] -* triton-inference-server/core: -DTRITON_CORE_REPO_TAG=[tag] -* triton-inference-server/common: -DTRITON_COMMON_REPO_TAG=[tag] +* triton-inference-server/backend: `-DTRITON_BACKEND_REPO_TAG=[tag]` +* triton-inference-server/core: `-DTRITON_CORE_REPO_TAG=[tag]` +* triton-inference-server/common: `-DTRITON_COMMON_REPO_TAG=[tag]` ## Build the PyTorch Backend With Custom PyTorch -Currently, Triton requires that a specially patched version of -PyTorch be used with the PyTorch backend. The full source for -these PyTorch versions are available as Docker images from -[NGC](https://ngc.nvidia.com). For example, the PyTorch version -compatible with the 25.09 release of Triton is available as -nvcr.io/nvidia/pytorch:25.09-py3. 
+Currently, Triton requires that a specially patched version of PyTorch be used with the PyTorch backend. +The full source for these PyTorch versions are available as Docker images from +[NGC](https://ngc.nvidia.com). -Copy over the LibTorch and Torchvision headers and libraries from the +For example, the PyTorch version compatible with the 25.09 release of Triton is available as `nvcr.io/nvidia/pytorch:25.09-py3` which supports PyTorch version `2.9.0a0`. + +> [!NOTE] +> Additional details and version information can be found in the container's +> [release notes](https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/rel-25-09.html#rel-25-09). + +Copy over the LibTorch and TorchVision headers and libraries from the [PyTorch NGC container](https://ngc.nvidia.com/catalog/containers/nvidia:pytorch) -into local directories. You can see which headers and libraries -are needed/copied from the docker. +into local directories. +You can see which headers and libraries are needed/copied from the docker. -``` -$ mkdir build -$ cd build -$ cmake -DCMAKE_INSTALL_PREFIX:PATH=`pwd`/install -DTRITON_PYTORCH_INCLUDE_PATHS="/torch;/torch/torch/csrc/api/include;/torchvision" -DTRITON_PYTORCH_LIB_PATHS="" .. -$ make install +```bash +mkdir build +cd build +cmake -DCMAKE_INSTALL_PREFIX:PATH=`pwd`/install -DTRITON_PYTORCH_INCLUDE_PATHS="/torch;/torch/torch/csrc/api/include;/torchvision" -DTRITON_PYTORCH_LIB_PATHS="" .. +make install ``` ## Using the PyTorch Backend -### Parameters +### PyTorch 2.0 Models -Triton exposes some flags to control the execution mode of the TorchScript models through -the Parameters section of the model's `config.pbtxt` file. +The model repository should look like: -* `DISABLE_OPTIMIZED_EXECUTION`: Boolean flag to disable the optimized execution -of TorchScript models. By default, the optimized execution is always enabled. +```bash +model_repository/ +`-- model_directory + |-- 1 + | |-- model.py + | `-- [model.pt] + `-- config.pbtxt +``` -The initial calls to a loaded TorchScript model take extremely long. Due to this longer -model warmup [issue](https://github.com/pytorch/pytorch/issues/57894), Triton also allows -execution of models without these optimizations. In some models, optimized execution -does not benefit performance as seen [here](https://github.com/pytorch/pytorch/issues/19978) -and in other cases impacts performance negatively, as seen [here](https://github.com/pytorch/pytorch/issues/53824). +The `model.py` contains the class definition of the PyTorch model. +The class should extend the +[`torch.nn.Module`](https://pytorch.org/docs/stable/generated/torch.nn.Module.html#torch.nn.Module). +The `model.pt` may be optionally provided which contains the saved +[`state_dict`](https://pytorch.org/tutorials/beginner/saving_loading_models.html#saving-loading-model-for-inference) +of the model. -The section of model config file specifying this parameter will look like: +### TorchScript Models -``` -parameters: { -key: "DISABLE_OPTIMIZED_EXECUTION" - value: { - string_value: "true" - } -} +The model repository should look like: + +```bash +model_repository/ +`-- model_directory + |-- 1 + | `-- model.pt + `-- config.pbtxt ``` -* `INFERENCE_MODE`: Boolean flag to enable the Inference Mode execution -of TorchScript models. By default, the inference mode is enabled. +The `model.pt` is the TorchScript model file. 
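For reference, a `model.pt` matching the TorchScript layout above can be produced with `torch.jit.script` (or `torch.jit.trace`). The sketch below is illustrative only; the `AddSub` module, its two outputs, and the save location are assumptions, not something the backend prescribes.

```python
# Minimal sketch: export a toy module as a TorchScript model.pt.
# The module, shapes, and filename below are examples, not backend requirements.
import torch


class AddSub(torch.nn.Module):
    def forward(self, x: torch.Tensor, y: torch.Tensor):
        # Two outputs, e.g. matching OUTPUT__0 / OUTPUT__1 in config.pbtxt.
        return x + y, x - y


scripted = torch.jit.script(AddSub().eval())
scripted.save("model.pt")
```

The saved file is then placed as `model_directory/1/model.pt` next to the model's `config.pbtxt`, as in the layout shown above.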
-[InferenceMode](https://pytorch.org/cppdocs/notes/inference_mode.html) is a new -RAII guard analogous to NoGradMode to be used when you are certain your operations -will have no interactions with autograd. Compared to NoGradMode, code run under -this mode gets better performance by disabling autograd. +## Configuration -Please note that in some models, InferenceMode might not benefit performance -and in fewer cases might impact performance negatively. +Triton exposes some flags to control the execution mode of the TorchScript models through the `Parameters` section of the model's `config.pbtxt` file. -The section of model config file specifying this parameter will look like: +### Parameters -``` -parameters: { -key: "INFERENCE_MODE" - value: { - string_value: "true" - } -} -``` +* `DISABLE_OPTIMIZED_EXECUTION`: + Boolean flag to disable the optimized execution of TorchScript models. + By default, the optimized execution is always enabled. -* `DISABLE_CUDNN`: Boolean flag to disable the cuDNN library. By default, cuDNN is enabled. + The initial calls to a loaded TorchScript model take a significant amount of time. + Due to this longer model warmup + ([pytorch #57894](https://github.com/pytorch/pytorch/issues/57894)), + Triton also allows execution of models without these optimizations. + In some models, optimized execution does not benefit performance + ([pytorch #19978](https://github.com/pytorch/pytorch/issues/19978)) + and in other cases impacts performance negatively + ([pytorch #53824](https://github.com/pytorch/pytorch/issues/53824)). -[cuDNN](https://developer.nvidia.com/cudnn) is a GPU-accelerated library of primitives for -deep neural networks. cuDNN provides highly tuned implementations for standard routines. + The section of model config file specifying this parameter will look like: -Typically, models run with cuDNN enabled are faster. However there are some exceptions -where using cuDNN can be slower, cause higher memory usage or result in errors. + ```proto + parameters: { + key: "DISABLE_OPTIMIZED_EXECUTION" + value: { string_value: "true" } + } + ``` +* `INFERENCE_MODE`: -The section of model config file specifying this parameter will look like: + Boolean flag to enable the Inference Mode execution of TorchScript models. + By default, the inference mode is enabled. -``` -parameters: { -key: "DISABLE_CUDNN" - value: { - string_value: "true" - } -} -``` + [InferenceMode](https://pytorch.org/cppdocs/notes/inference_mode.html) is a new RAII guard analogous to `NoGradMode` to be used when you are certain your operations will have no interactions with autograd. + Compared to `NoGradMode`, code run under this mode gets better performance by disabling autograd. -* `ENABLE_WEIGHT_SHARING`: Boolean flag to enable model instances on the same device to -share weights. This optimization should not be used with stateful models. If not specified, -weight sharing is disabled. + Please note that in some models, InferenceMode might not benefit performance and in fewer cases might impact performance negatively. -The section of model config file specifying this parameter will look like: + To enable inference mode, use the configuration example below: -``` -parameters: { -key: "ENABLE_WEIGHT_SHARING" - value: { - string_value: "true" - } -} -``` + ```proto + parameters: { + key: "INFERENCE_MODE" + value: { string_value: "true" } + } + ``` -* `ENABLE_CACHE_CLEANING`: Boolean flag to enable CUDA cache cleaning after each model execution. -If not specified, cache cleaning is disabled. 
This flag has no effect if model is on CPU. -Setting this flag to true will negatively impact the performance due to additional CUDA cache -cleaning operation after each model execution. Therefore, you should only use this flag if you -serve multiple models with Triton and encounter CUDA out of memory issue during model executions. +* `DISABLE_CUDNN`: -The section of model config file specifying this parameter will look like: + Boolean flag to disable the cuDNN library. + By default, cuDNN is enabled. -``` -parameters: { -key: "ENABLE_CACHE_CLEANING" - value: { - string_value:"true" - } -} -``` + [cuDNN](https://developer.nvidia.com/cudnn) is a GPU-accelerated library of primitives for deep neural networks. + It provides highly tuned implementations for standard routines. + + Typically, models run with cuDNN enabled execute faster. + However there are some exceptions where using cuDNN can be slower, cause higher memory usage, or result in errors. + + To disable cuDNN, use the configuration example below: + + ```proto + parameters: { + key: "DISABLE_CUDNN" + value: { string_value: "true" } + } + ``` + +* `ENABLE_WEIGHT_SHARING`: + + Boolean flag to enable model instances on the same device to share weights. + This optimization should not be used with stateful models. + If not specified, weight sharing is disabled. + + To enable weight sharing, use the configuration example below: + + ```proto + parameters: { + key: "ENABLE_WEIGHT_SHARING" + value: { string_value: "true" } + } + ``` + +* `ENABLE_CACHE_CLEANING`: + + Boolean flag to enable CUDA cache cleaning after each model execution. + If not specified, cache cleaning is disabled. + This flag has no effect if model is on CPU. + + Setting this flag to true will likely negatively impact the performance due to additional CUDA cache cleaning operation after each model execution. + Therefore, you should only use this flag if you serve multiple models with Triton and encounter CUDA out-of-memory issues during model executions. + + To enable cleaning of the CUDA cache after every execution, use the configuration example below: + + ```proto + parameters: { + key: "ENABLE_CACHE_CLEANING" + value: { string_value: "true" } + } + ``` * `INTER_OP_THREAD_COUNT`: -PyTorch allows using multiple CPU threads during TorchScript model inference. -One or more inference threads execute a model's forward pass on the given -inputs. Each inference thread invokes a JIT interpreter that executes the ops -of a model inline, one by one. This parameter sets the size of this thread -pool. The default value of this setting is the number of cpu cores. Please refer -to [this](https://pytorch.org/docs/stable/notes/cpu_threading_torchscript_inference.html) -document on how to set this parameter properly. + PyTorch allows using multiple CPU threads during TorchScript model inference. + One or more inference threads execute a model’s forward pass on the given inputs. + Each inference thread invokes a JIT interpreter that executes the ops of a model inline, one by one. -The section of model config file specifying this parameter will look like: + This parameter sets the size of this thread pool. + The default value of this setting is the number of cpu cores. -``` -parameters: { -key: "INTER_OP_THREAD_COUNT" - value: { - string_value:"1" - } -} -``` + > [!TIP] + > Refer to + > [CPU Threading TorchScript](https://pytorch.org/docs/stable/notes/cpu_threading_torchscript_inference.html) + > on how to set this parameter properly. 
+ + To set the inter-op thread count, use the configuration example below: + + ```proto + parameters: { + key: "INTER_OP_THREAD_COUNT" + value: { string_value: "1" } + } + ``` > [!NOTE] > This parameter is set globally for the PyTorch backend. @@ -225,70 +265,68 @@ key: "INTER_OP_THREAD_COUNT" * `INTRA_OP_THREAD_COUNT`: -In addition to the inter-op parallelism, PyTorch can also utilize multiple threads -within the ops (intra-op parallelism). This can be useful in many cases, including -element-wise ops on large tensors, convolutions, GEMMs, embedding lookups and -others. The default value for this setting is the number of CPU cores. Please refer -to [this](https://pytorch.org/docs/stable/notes/cpu_threading_torchscript_inference.html) -document on how to set this parameter properly. + In addition to the inter-op parallelism, PyTorch can also utilize multiple threads within the ops (intra-op parallelism). + This can be useful in many cases, including element-wise ops on large tensors, convolutions, GEMMs, embedding lookups and others. -The section of model config file specifying this parameter will look like: + The default value for this setting is the number of CPU cores. -``` -parameters: { -key: "INTRA_OP_THREAD_COUNT" - value: { - string_value:"1" - } -} -``` + > [!TIP] + > Refer to + > [CPU Threading TorchScript](https://pytorch.org/docs/stable/notes/cpu_threading_torchscript_inference.html) + > on how to set this parameter properly. -> [!NOTE] -> This parameter is set globally for the PyTorch backend. -> The value from the first model config file that specifies this parameter will be used. -> Subsequent values from other model config files, if different, will be ignored. + To set the intra-op thread count, use the configuration example below: + + ```proto + parameters: { + key: "INTRA_OP_THREAD_COUNT" + value: { string_value: "1" } + } + ``` -* Additional Optimizations: Three additional boolean parameters are available to disable -certain Torch optimizations that can sometimes cause latency regressions in models with -complex execution modes and dynamic shapes. If not specified, all are enabled by default. +* **Additional Optimizations**: + + Three additional boolean parameters are available to disable certain Torch optimizations that can sometimes cause latency regressions in models with complex execution modes and dynamic shapes. + If not specified, all are enabled by default. `ENABLE_JIT_EXECUTOR` `ENABLE_JIT_PROFILING` -### PyTorch 2.0 Models +### Model Instance Group Kind -The model repository should look like: +The PyTorch backend supports the following kinds of +[Model Instance Groups](https://github.com/triton-inference-server/server/blob/main/docs/user_guide/model_configuration.md#instance-groups) +where the input tensors are placed as follows: -```bash -model_repository/ -`-- model_directory - |-- 1 - | |-- model.py - | `-- [model.pt] - `-- config.pbtxt -``` +* `KIND_GPU`: -The `model.py` contains the class definition of the PyTorch model. -The class should extend the -[`torch.nn.Module`](https://pytorch.org/docs/stable/generated/torch.nn.Module.html#torch.nn.Module). -The `model.pt` may be optionally provided which contains the saved -[`state_dict`](https://pytorch.org/tutorials/beginner/saving_loading_models.html#saving-loading-model-for-inference) -of the model. + Inputs are prepared on the GPU device associated with the model instance. -### TorchScript Models +* `KIND_CPU`: -The model repository should look like: + Inputs are prepared on the CPU. 
-```bash -model_repository/ -`-- model_directory - |-- 1 - | `-- model.pt - `-- config.pbtxt -``` +* `KIND_MODEL`: -The `model.pt` is the TorchScript model file. + Inputs are prepared on the CPU. + When loading the model, the backend does not choose the GPU device for the model; + instead, it respects the device(s) specified in the model and uses them as they are during inference. + + This is useful when the model internally utilizes multiple GPUs, as demonstrated in + [this example model](https://github.com/triton-inference-server/server/blob/main/qa/L0_libtorch_instance_group_kind_model/gen_models.py). + + > [!IMPORTANT] + > If a device is not specified in the model, the backend uses the first available GPU device. + +To set the model instance group, use the configuration example below: + +```proto +instance_group { + count: 2 + kind: KIND_GPU +} +``` ### Customization @@ -329,69 +367,46 @@ parameters: { } ``` -### Support +## Important Notes -#### Model Instance Group Kind +* The execution of PyTorch model on GPU is asynchronous in nature. + See + [CUDA Asynchronous Execution](https://pytorch.org/docs/stable/notes/cuda.html#asynchronous-execution) + for additional details. + Consequently, an error in PyTorch model execution may be raised during the next few inference requests to the server. + Setting environment variable `CUDA_LAUNCH_BLOCKING=1` when launching server will help in correctly debugging failing cases by forcing synchronous execution. -The PyTorch backend supports the following kinds of -[Model Instance Groups](https://github.com/triton-inference-server/server/blob/main/docs/user_guide/model_configuration.md#instance-groups) -where the input tensors are placed as follows: + * The PyTorch model in such cases may or may not recover from the failed state and a restart of the server may be required to continue serving successfully. -* `KIND_GPU`: Inputs are prepared on the GPU device associated with the model -instance. - -* `KIND_CPU`: Inputs are prepared on the CPU. - -* `KIND_MODEL`: Inputs are prepared on the CPU. When loading the model, the -backend does not choose the GPU device for the model; instead, it respects the -device(s) specified in the model and uses them as they are during inference. -This is useful when the model internally utilizes multiple GPUs, as demonstrated -in this -[example model](https://github.com/triton-inference-server/server/blob/main/qa/L0_libtorch_instance_group_kind_model/gen_models.py). -If no device is specified in the model, the backend uses the first available -GPU device. This feature is available starting in the 23.06 release. - -### Important Notes - -* The execution of PyTorch model on GPU is asynchronous in nature. See - [here](https://pytorch.org/docs/stable/notes/cuda.html#asynchronous-execution) - for more details. Consequently, an error in PyTorch model execution may - be raised during the next few inference requests to the server. Setting - environment variable `CUDA_LAUNCH_BLOCKING=1` when launching server will - help in correctly debugging failing cases by forcing synchronous execution. - * The PyTorch model in such cases may or may not recover from the failed - state and a restart of the server may be required to continue serving - successfully. - -* PyTorch does not support Tensor of Strings but it does support models that -accept a List of Strings as input(s) / produces a List of String as output(s). -For these models Triton allows users to pass String input(s)/receive String -output(s) using the String datatype. 
As a limitation of using List instead of
-Tensor for String I/O, only for 1-dimensional input(s)/output(s) are supported
-for I/O of String type.
+* PyTorch does not support Tensor of Strings but it does support models that accept a List of Strings as input(s) / produce a List of Strings as output(s).
+  For these models, Triton allows users to pass String input(s)/receive String output(s) using the String datatype.
+  As a limitation of using List instead of Tensor for String I/O, only 1-dimensional input(s)/output(s) are supported for String I/O.

 * In a multi-GPU environment, a potential runtime issue can occur when using
-[Tracing](https://pytorch.org/docs/stable/generated/torch.jit.trace.html)
-to generate a
-[TorchScript](https://pytorch.org/docs/stable/jit.html) model. This issue
-arises due to a device mismatch between the model instance and the tensor. By
-default, Triton creates a single execution instance of the model for each
-available GPU. The runtime error occurs when a request is sent to a model
-instance with a different GPU device from the one used during the TorchScript
-generation process. To address this problem, it is highly recommended to use
-[Scripting](https://pytorch.org/docs/stable/generated/torch.jit.script.html#torch.jit.script)
-instead of Tracing for model generation in a multi-GPU environment. Scripting
-avoids the device mismatch issue and ensures compatibility with different GPUs
-when used with Triton. However, if using Tracing is unavoidable, there is a
-workaround available. You can explicitly specify the GPU device for the model
-instance in the
-[model configuration](https://github.com/triton-inference-server/server/blob/main/docs/user_guide/model_configuration.md#instance-groups)
-to ensure that the model instance and the tensors used for inference are
-assigned to the same GPU device as on which the model was traced.
-* Python functions optimizable by `torch.compile` may not be served directly in the `model.py` file, they need to be enclosed by a class extending the
-  [`torch.nn.Module`](https://pytorch.org/docs/stable/generated/torch.nn.Module.html#torch.nn.Module).
+  [Tracing](https://pytorch.org/docs/stable/generated/torch.jit.trace.html)
+  to generate a
+  [TorchScript](https://pytorch.org/docs/stable/jit.html)
+  model.
+  This issue arises due to a device mismatch between the model instance and the tensor.
-* Model weights cannot be shared across multiple instances on the same GPU device.
+  By default, Triton creates a single execution instance of the model for each available GPU.
+  The runtime error occurs when a request is sent to a model instance with a different GPU device from the one used during the TorchScript generation process.
+
+  To address this problem, it is highly recommended to use
+  [Scripting](https://pytorch.org/docs/stable/generated/torch.jit.script.html#torch.jit.script)
+  instead of Tracing for model generation in a multi-GPU environment.
+  Scripting avoids the device mismatch issue and ensures compatibility with different GPUs when used with Triton.
+
+  However, if using Tracing is unavoidable, there is a workaround available.
+  You can explicitly specify the GPU device for the model instance in the
+  [model configuration](https://github.com/triton-inference-server/server/blob/main/docs/user_guide/model_configuration.md#instance-groups)
+  to ensure that the model instance and the tensors used for inference are assigned to the same GPU device as on which the model was traced.
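To make the Scripting recommendation in the note above concrete, here is a minimal sketch contrasting the two export paths; `MyModel`, the example input, and the output filename are hypothetical, not part of the backend.

```python
# Minimal sketch: Scripting vs. Tracing for a model that will run on whichever
# GPU Triton assigns to the instance. Names and shapes here are examples only.
import torch


class MyModel(torch.nn.Module):
    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return torch.relu(x) * 2


device = "cuda:0" if torch.cuda.is_available() else "cpu"
model = MyModel().eval().to(device)

# Tracing specializes the exported graph to one concrete run on `device`,
# which is where the device mismatch described in the note above can originate.
traced = torch.jit.trace(model, torch.randn(1, 4, device=device))

# Scripting compiles the model's code instead, so the exported model does not
# depend on the GPU used at export time; this is the recommended path.
scripted = torch.jit.script(model)
scripted.save("model.pt")
```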
* When using `KIND_MODEL` as model instance kind, the default device of the first parameter on the model is used. + +> [!WARNING] +> +> * Python functions optimizable by `torch.compile` may not be served directly in the `model.py` file, they need to be enclosed by a class extending the + [`torch.nn.Module`](https://pytorch.org/docs/stable/generated/torch.nn.Module.html#torch.nn.Module). +> +> * Model weights cannot be shared across multiple instances on the same GPU device. From 92692f8f09cd611bef06c01f1d4ea7933e22a323 Mon Sep 17 00:00:00 2001 From: J Wyman Date: Wed, 29 Oct 2025 14:48:39 -0400 Subject: [PATCH 72/76] docs: Correct README Instructions (#164) This change corrects the instruction for how to use PyTorch 2 with the backend. --- README.md | 27 +++++++++++++++++++-------- 1 file changed, 19 insertions(+), 8 deletions(-) diff --git a/README.md b/README.md index dc227e1..ccc803c 100644 --- a/README.md +++ b/README.md @@ -103,23 +103,23 @@ make install ### PyTorch 2.0 Models +PyTorch 2.0 features are available. +However, Triton's PyTorch backend requires a serialized representation of the model in the form a `model.pt` file. +The serialized representation of the model can be generated using PyTorch's +[`torch.save()`](https://docs.pytorch.org/tutorials/beginner/saving_loading_models.html#id1) +function to generate the `model.pt` file. + The model repository should look like: ```bash model_repository/ `-- model_directory |-- 1 - | |-- model.py - | `-- [model.pt] + | `-- model.pt `-- config.pbtxt ``` -The `model.py` contains the class definition of the PyTorch model. -The class should extend the -[`torch.nn.Module`](https://pytorch.org/docs/stable/generated/torch.nn.Module.html#torch.nn.Module). -The `model.pt` may be optionally provided which contains the saved -[`state_dict`](https://pytorch.org/tutorials/beginner/saving_loading_models.html#saving-loading-model-for-inference) -of the model. +Where `model.pt` is the serialized representation of the model. ### TorchScript Models @@ -139,6 +139,17 @@ The `model.pt` is the TorchScript model file. Triton exposes some flags to control the execution mode of the TorchScript models through the `Parameters` section of the model's `config.pbtxt` file. +### Configuration Options + +* `default_model_name`: + Instructs the Triton PyTorch backend to load the model from a file of the given name. + + The model config specifying the option would look like: + + ```proto + default_model_name: "another_file_name.pt" + ``` + ### Parameters * `DISABLE_OPTIMIZED_EXECUTION`: From a95f663e67a713c792e48f2e3c0c6d282e63de84 Mon Sep 17 00:00:00 2001 From: Misha Chornyi <99709299+mc-nv@users.noreply.github.com> Date: Mon, 3 Nov 2025 09:43:09 -0800 Subject: [PATCH 73/76] Adding libtorch_nvshmem.so (#162) (#166) * Adding libtorch_nvshmem.so * change: CPU only build doesn't have CUDA_VERSION environment variable. Using flag to control library inclusion. * Removing generation expression --- CMakeLists.txt | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/CMakeLists.txt b/CMakeLists.txt index 3afe90b..3ec2d55 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -50,6 +50,7 @@ option(TRITON_ENABLE_STATS "Include statistics collections in backend" ON) option(TRITON_ENABLE_NVTX "Include nvtx markers collection in backend." 
OFF) option(TRITON_PYTORCH_ENABLE_TORCHTRT "Enable TorchTRT support" OFF) option(TRITON_PYTORCH_ENABLE_TORCHVISION "Enable Torchvision support" ON) +option(TRITON_PYTORCH_NVSHMEM "Enable NVSHMEM support" ON) set(TRITON_PYTORCH_DOCKER_IMAGE "" CACHE STRING "Docker image containing the PyTorch build required by backend.") set(TRITON_PYTORCH_INCLUDE_PATHS "" CACHE PATH "Paths to Torch includes") @@ -162,6 +163,13 @@ set(PT_LIBS "libjpeg.so.62" ) +if (${TRITON_PYTORCH_NVSHMEM}) + set(PT_LIBS + ${PT_LIBS} + "libtorch_nvshmem.so" + ) +endif() # TRITON_PYTORCH_NVSHMEM + if (${TRITON_PYTORCH_ENABLE_TORCHVISION}) set(PT_LIBS ${PT_LIBS} @@ -238,6 +246,7 @@ if (${TRITON_PYTORCH_DOCKER_BUILD}) COMMAND docker cp pytorch_backend_ptlib:${PY_INSTALL_PATH}/torch/lib/libtorch_cuda_linalg.so libtorch_cuda_linalg.so COMMAND docker cp pytorch_backend_ptlib:${PY_INSTALL_PATH}/torch/lib/libtorch_global_deps.so libtorch_global_deps.so COMMAND docker cp pytorch_backend_ptlib:${PY_INSTALL_PATH}/torch/lib/libcaffe2_nvrtc.so libcaffe2_nvrtc.so + COMMAND /bin/sh -c "if [ ${TRITON_PYTORCH_NVSHMEM} = 'ON' ]; then docker cp pytorch_backend_ptlib:${PY_INSTALL_PATH}/torch/lib/libtorch_nvshmem.so libtorch_nvshmem.so; fi" COMMAND /bin/sh -c "if [ ${TRITON_PYTORCH_ENABLE_TORCHVISION} = 'ON' ]; then if [ ${RHEL_BUILD} = 'ON' ]; then docker cp -a -L pytorch_backend_ptlib:/usr/local/lib64/libtorchvision.so libtorchvision.so; else docker cp -a -L pytorch_backend_ptlib:/usr/local/${LIB_DIR}/libtorchvision.so.1 libtorchvision.so.1; fi; fi" COMMAND /bin/sh -c "if [ ${TRITON_PYTORCH_ENABLE_TORCHVISION} = 'ON' ]; then docker cp pytorch_backend_ptlib:/opt/pytorch/vision/torchvision/csrc include/torchvision/torchvision; fi" COMMAND /bin/sh -c "if [ ${TRITON_PYTORCH_ENABLE_TORCHTRT} = 'ON' ]; then docker cp pytorch_backend_ptlib:/usr/local/lib/python3.12/dist-packages/torch_tensorrt/lib/libtorchtrt_runtime.so libtorchtrt_runtime.so; fi" From 45bd8e5f6c5b853d9618b5eb06fbf17188694ceb Mon Sep 17 00:00:00 2001 From: Misha Chornyi <99709299+mc-nv@users.noreply.github.com> Date: Tue, 4 Nov 2025 10:19:28 -0800 Subject: [PATCH 74/76] fix(pre-commit): update hooks versions (#169) --- .github/workflows/pre-commit.yml | 10 +++++----- .pre-commit-config.yaml | 8 ++++---- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/.github/workflows/pre-commit.yml b/.github/workflows/pre-commit.yml index ab4bd95..4fa1873 100644 --- a/.github/workflows/pre-commit.yml +++ b/.github/workflows/pre-commit.yml @@ -1,4 +1,4 @@ -# Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# Copyright 2023-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions @@ -31,8 +31,8 @@ on: jobs: pre-commit: - runs-on: ubuntu-22.04 + runs-on: ubuntu-latest steps: - - uses: actions/checkout@v3 - - uses: actions/setup-python@v3 - - uses: pre-commit/action@v3.0.0 + - uses: actions/checkout@v5.0.0 + - uses: actions/setup-python@v6.0.0 + - uses: pre-commit/action@v3.0.1 diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 298baab..3c76a6e 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,4 +1,4 @@ -# Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# Copyright 2023-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
# # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions @@ -25,7 +25,7 @@ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. repos: -- repo: https://github.com/timothycrosley/isort +- repo: https://github.com/PyCQA/isort rev: 5.12.0 hooks: - id: isort @@ -36,7 +36,7 @@ repos: - id: black types_or: [python, cython] - repo: https://github.com/PyCQA/flake8 - rev: 5.0.4 + rev: 7.3.0 hooks: - id: flake8 args: [--max-line-length=88, --select=C,E,F,W,B,B950, --extend-ignore = E203,E501] @@ -57,7 +57,7 @@ repos: # More details about these pre-commit hooks here: # https://pre-commit.com/hooks.html - repo: https://github.com/pre-commit/pre-commit-hooks - rev: v4.4.0 + rev: v6.0.0 hooks: - id: check-case-conflict - id: check-executables-have-shebangs From cb9336c0c05c349bfd54a611680840cabb2b1a03 Mon Sep 17 00:00:00 2001 From: J Wyman Date: Tue, 4 Nov 2025 16:59:30 -0500 Subject: [PATCH 75/76] maintenance: Separate Code into Separate Files (#163) * maintenance: Separate Code into Separate Files This change breaks the monolithic src/libtorch.cc into multiple files, with a modern separation of classes into separate header and code files. * Accept Rename Renamed 'string_utilities.*' to 'string_utils.*' as requested. --- CMakeLists.txt | 3 + src/libtorch.cc | 2493 +---------------------------------- src/libtorch.hh | 59 + src/model_instance_state.cc | 1632 +++++++++++++++++++++++ src/model_instance_state.hh | 178 +++ src/model_state.cc | 495 +++++++ src/model_state.hh | 131 ++ src/naming_convention.hh | 40 + src/string_utils.cc | 254 ++++ src/string_utils.hh | 106 ++ 10 files changed, 2902 insertions(+), 2489 deletions(-) create mode 100644 src/libtorch.hh create mode 100644 src/model_instance_state.cc create mode 100644 src/model_instance_state.hh create mode 100644 src/model_state.cc create mode 100644 src/model_state.hh create mode 100644 src/naming_convention.hh create mode 100644 src/string_utils.cc create mode 100644 src/string_utils.hh diff --git a/CMakeLists.txt b/CMakeLists.txt index 3ec2d55..5b0e399 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -289,6 +289,9 @@ add_library( src/libtorch.cc src/libtorch_utils.cc src/libtorch_utils.h + src/model_instance_state.cc + src/model_state.cc + src/string_utils.cc ) add_library( diff --git a/src/libtorch.cc b/src/libtorch.cc index c873375..500f1f5 100644 --- a/src/libtorch.cc +++ b/src/libtorch.cc @@ -24,2498 +24,13 @@ // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-#include - -#include -#include -#include - -#include "libtorch_utils.h" -#include "triton/backend/backend_common.h" -#include "triton/backend/backend_input_collector.h" -#include "triton/backend/backend_memory.h" -#include "triton/backend/backend_model.h" -#include "triton/backend/backend_model_instance.h" -#include "triton/backend/backend_output_responder.h" -#include "triton/common/nvtx.h" -#include "triton/core/tritonbackend.h" - -#ifdef TRITON_PYTORCH_ENABLE_TORCHVISION -// Suppress warnings in torch headers -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wsign-compare" -#pragma warning(push, 0) -#include -#include // Torchvision header -#pragma warning(pop) -#pragma GCC diagnostic pop -#endif // TRITON_PYTORCH_ENABLE_TORCHVISION - -#ifdef TRITON_ENABLE_GPU -#include -#include -#include -#endif // TRITON_ENABLE_GPU - -// for thread control -// https://pytorch.org/docs/stable/notes/cpu_threading_torchscript_inference.html#runtime-api -// https://github.com/pytorch/pytorch/blob/v2.2.1-rc3/aten/src/ATen/Parallel.h#L133 -#include - +#include "libtorch.hh" // // PyTorch C++ (LibTorch) Backend that implements the TRITONBACKEND API. // -namespace { -std::once_flag pytorch_interop_threads_flag; -std::once_flag pytorch_intraop_threads_flag; -} // namespace - -namespace triton { namespace backend { namespace pytorch { - -// -// ModelState -// -// State associated with a model that is using this backend. An object -// of this class is created and associated with each -// TRITONBACKEND_Model. -// -class ModelState : public BackendModel { - public: - static TRITONSERVER_Error* Create( - TRITONBACKEND_Model* triton_model, ModelState** state); - virtual ~ModelState() = default; - - // Load a TorchScript model using 'artifact_name' as the name for the - // TorchScript file. Return in 'model_path' the full path to the - // TorchScript file, return in 'torch_model' the Torch Module - // representing the model. - TRITONSERVER_Error* LoadModel( - const std::string& artifact_name, const torch::Device device, - std::string* model_path, const TRITONSERVER_InstanceGroupKind& kind, - std::shared_ptr* torch_model); - - bool EnabledOptimizedExecution() { return enable_optimized_execution_; } - const std::pair& EnabledTensorExprFuser() const - { - return enable_tensor_fuser_pair_; - } - const std::pair& EnabledJitProfiling() const - { - return enable_jit_profiling_pair_; - } - const std::pair& EnabledJitExecutor() const - { - return enable_jit_executor_pair_; - } - bool EnabledInferenceMode() { return enable_inference_mode_; } - bool EnabledCudnn() { return enable_cudnn_; } - bool EnabledCacheCleaning() { return enable_cache_cleaning_; } - - bool EnabledWeightSharing() { return enable_weight_sharing_; } - const std::map>& ModelOutputs() - { - return model_outputs_; - } - - private: - ModelState(TRITONBACKEND_Model* triton_model); - TRITONSERVER_Error* AutoCompleteConfig(); - - // Parses and validates parameters in config - TRITONSERVER_Error* ParseParameters(); - - // Flag to indicate whether optimized execution is enabled. Defaults to true. - bool enable_optimized_execution_; - - // Flag to indicate whether inference mode is enabled. Defaults to false. - bool enable_inference_mode_; - - // Flag to indicate whether cudnn is enabled. Defaults to true. - bool enable_cudnn_; - - // Flag to indicate whether cache cleaning after each run is enabled. - // Defaults to false. - bool enable_cache_cleaning_; - - // Flag to indicate whether weight sharing is enabled. Defaults to false. 
- bool enable_weight_sharing_; - - // Flag pairs to indicate if various JIT settings are set and - // enabled respectively. Defaults to (false, true). Default behavior - // is to do nothing if not explicitly set. - std::pair enable_tensor_fuser_pair_; - std::pair enable_jit_profiling_pair_; - std::pair enable_jit_executor_pair_; - - // Model mapping for shared TorchScript model across all instances on the - // same device. The key is a pair of isGPU and device index. - std::map< - std::pair, std::shared_ptr> - torch_models_; - - // model_outputs is a map that contains unique outputs that the model must - // provide. The first pair is the model output index and the second is - // the index in the model state, -1 is used if one is not required. - // In the model configuration, the output in the state configuration - // can have intersection with the outputs section of the model. If an output - // is specified both in the output section and state section, it indicates - // that the backend must return the output state to the client too. - std::map> model_outputs_; -}; - -TRITONSERVER_Error* -ModelState::Create(TRITONBACKEND_Model* triton_model, ModelState** state) -{ - try { - *state = new ModelState(triton_model); - } - catch (const BackendModelException& ex) { - RETURN_ERROR_IF_TRUE( - ex.err_ == nullptr, TRITONSERVER_ERROR_INTERNAL, - std::string("unexpected nullptr in BackendModelException")); - RETURN_IF_ERROR(ex.err_); - } - - // Auto-complete the configuration if requested... - bool auto_complete_config = false; - RETURN_IF_ERROR(TRITONBACKEND_ModelAutoCompleteConfig( - triton_model, &auto_complete_config)); - if (auto_complete_config) { - RETURN_IF_ERROR((*state)->AutoCompleteConfig()); - RETURN_IF_ERROR((*state)->SetModelConfig()); - } - - auto& model_outputs = (*state)->model_outputs_; - // Parse the output states in the model configuration - triton::common::TritonJson::Value sequence_batching; - if ((*state)->ModelConfig().Find("sequence_batching", &sequence_batching)) { - triton::common::TritonJson::Value states; - if (sequence_batching.Find("state", &states)) { - for (size_t i = 0; i < states.ArraySize(); i++) { - triton::common::TritonJson::Value state; - RETURN_IF_ERROR(states.IndexAsObject(i, &state)); - std::string output_state_name; - RETURN_IF_ERROR( - state.MemberAsString("output_name", &output_state_name)); - auto it = model_outputs.find(output_state_name); - if (it == model_outputs.end()) { - model_outputs.insert({output_state_name, std::make_pair(-1, i)}); - } else { - it->second.second = i; - } - } - } - } - - // Parse the output names in the model configuration - triton::common::TritonJson::Value outputs; - RETURN_IF_ERROR((*state)->ModelConfig().MemberAsArray("output", &outputs)); - for (size_t i = 0; i < outputs.ArraySize(); i++) { - triton::common::TritonJson::Value output; - THROW_IF_BACKEND_INSTANCE_ERROR(outputs.IndexAsObject(i, &output)); - - // Use names from ModelConfig by reference since the model - // config will persist longer than this inference execution. 
- std::string output_name; - THROW_IF_BACKEND_INSTANCE_ERROR( - output.MemberAsString("name", &output_name)); - - auto it = model_outputs.find(output_name); - if (it == model_outputs.end()) { - model_outputs.insert({output_name, std::make_pair(i, -1)}); - } else { - it->second.first = i; - } - } - - RETURN_IF_ERROR((*state)->ParseParameters()); - - return nullptr; // success -} - -ModelState::ModelState(TRITONBACKEND_Model* triton_model) - : BackendModel(triton_model), enable_optimized_execution_(true), - enable_inference_mode_(true), enable_cudnn_(true), - enable_cache_cleaning_(false), enable_weight_sharing_(false), - enable_tensor_fuser_pair_({false, true}), - enable_jit_profiling_pair_({false, true}), - enable_jit_executor_pair_({false, true}) -{ -} - -TRITONSERVER_Error* -ModelState::LoadModel( - const std::string& artifact_name, const torch::Device device, - std::string* model_path, const TRITONSERVER_InstanceGroupKind& kind, - std::shared_ptr* torch_model) -{ - // Find the TorchScript file that describes the model. If the model - // configuration doesn't have an explicit model file specified then - // use the default name ("model.pt"). - std::string cc_model_filename = artifact_name; - if (cc_model_filename.empty()) { - cc_model_filename = "model.pt"; - } - - *model_path = JoinPath( - {RepositoryPath(), std::to_string(Version()), cc_model_filename}); - - { - bool exists; - RETURN_IF_ERROR(FileExists(*model_path, &exists)); - RETURN_ERROR_IF_FALSE( - exists, TRITONSERVER_ERROR_UNAVAILABLE, - std::string("unable to find '") + *model_path + - "' for model instance '" + Name() + "'"); - } - - // If weight sharing is enabled, skip loading model if - // it is already available on the target device - std::pair device_pair; - if (enable_weight_sharing_) { - device_pair = std::make_pair(!device.is_cpu(), device.index()); - auto mit = torch_models_.find(device_pair); - if (mit != torch_models_.end()) { - *torch_model = mit->second; - LOG_MESSAGE( - TRITONSERVER_LOG_INFO, - (std::string("Reusing TorchScript model for instance '") + Name() + - "'") - .c_str()); - return nullptr; // success - } - } - - // Serialize the torch model to string - std::string model_data_str; - RETURN_IF_ERROR(ReadTextFile(*model_path, &model_data_str)); - - // InferenceMode should be used to guard all tensors operations including - // model loading: https://pytorch.org/cppdocs/notes/inference_mode.html - torch::InferenceMode infer_guard(EnabledInferenceMode()); - - try { - std::istringstream model_stream(model_data_str); - if (kind == TRITONSERVER_INSTANCEGROUPKIND_MODEL) { - // Load the model without selecting a device. - torch_model->reset( - new torch::jit::Module(torch::jit::load(model_stream))); - } else { - torch_model->reset( - new torch::jit::Module(torch::jit::load(model_stream, device))); - } - } - catch (const std::exception& ex) { - return TRITONSERVER_ErrorNew( - TRITONSERVER_ERROR_INTERNAL, - ("failed to load model '" + Name() + "': " + ex.what()).c_str()); - } - - if (enable_weight_sharing_) { - if (!((torch_models_.emplace(device_pair, *torch_model)).second)) { - std::string type = device.is_cpu() ? 
"CPU" : "GPU"; - LOG_MESSAGE( - TRITONSERVER_LOG_WARN, - (std::string("Model already found on target ") + type + " device " + - "(id " + std::to_string(device.index()) + ") for '" + Name() + "'") - .c_str()); - } - } - - return nullptr; // success -} - -TRITONSERVER_Error* -ModelState::AutoCompleteConfig() -{ - // Auto-complete configuration is not supported since PyTorch does not - // store/capture sufficient model metadata so just log error instead. - LOG_MESSAGE( - TRITONSERVER_LOG_WARN, - (std::string("skipping model configuration auto-complete for '") + - Name() + "': not supported for pytorch backend") - .c_str()); - - return nullptr; // success -} - -TRITONSERVER_Error* -ModelState::ParseParameters() -{ - triton::common::TritonJson::Value params; - bool status = model_config_.Find("parameters", ¶ms); - if (status) { - // If 'DISABLE_OPTIMIZED_EXECUTION' is not present in 'parameters' then no - // update is made to 'enable_optimized_execution_'. - bool disable_optimized_execution = false; - TRITONSERVER_Error* err = ParseParameter( - params, "DISABLE_OPTIMIZED_EXECUTION", &disable_optimized_execution); - if (err != nullptr) { - if (TRITONSERVER_ErrorCode(err) != TRITONSERVER_ERROR_NOT_FOUND) { - return err; - } else { - TRITONSERVER_ErrorDelete(err); - } - } - enable_optimized_execution_ = !disable_optimized_execution; - - LOG_MESSAGE( - TRITONSERVER_LOG_INFO, - (std::string("Optimized execution is ") + - (enable_optimized_execution_ ? "enabled" : "disabled") + - " for model instance '" + Name() + "'") - .c_str()); - - // If 'ENABLE_CACHE_CLEANING' is not present in 'parameters' then - // no update is made to 'enable_cache_cleaning_'. - err = ParseParameter( - params, "ENABLE_CACHE_CLEANING", &enable_cache_cleaning_); - if (err != nullptr) { - if (TRITONSERVER_ErrorCode(err) != TRITONSERVER_ERROR_NOT_FOUND) { - return err; - } else { - TRITONSERVER_ErrorDelete(err); - } - } - - LOG_MESSAGE( - TRITONSERVER_LOG_INFO, - (std::string("Cache Cleaning is ") + - (enable_cache_cleaning_ ? "enabled" : "disabled") + - " for model instance '" + Name() + "'") - .c_str()); - - // If 'INFERENCE_MODE' is not present in 'parameters' then no update is made - // to 'enable_inference_mode_'. - err = ParseParameter(params, "INFERENCE_MODE", &enable_inference_mode_); - if (err != nullptr) { - if (TRITONSERVER_ErrorCode(err) != TRITONSERVER_ERROR_NOT_FOUND) { - return err; - } else { - TRITONSERVER_ErrorDelete(err); - } - } - LOG_MESSAGE( - TRITONSERVER_LOG_INFO, - (std::string("Inference Mode is ") + - (enable_inference_mode_ ? "enabled" : "disabled") + - " for model instance '" + Name() + "'") - .c_str()); - - // If 'DISABLE_CUDNN' is not present in 'parameters' then no update is made - // to 'enable_cudnn_'. - bool disable_cudnn = false; - err = ParseParameter(params, "DISABLE_CUDNN", &disable_cudnn); - if (err != nullptr) { - if (TRITONSERVER_ErrorCode(err) != TRITONSERVER_ERROR_NOT_FOUND) { - return err; - } else { - TRITONSERVER_ErrorDelete(err); - } - } - enable_cudnn_ = !disable_cudnn; - LOG_MESSAGE( - TRITONSERVER_LOG_INFO, - (std::string("cuDNN is ") + (enable_cudnn_ ? "enabled" : "disabled") + - " for model instance '" + Name() + "'") - .c_str()); - - // If 'ENABLE_TENSOR_FUSER' is not present in 'parameters' then no - // update is made to 'enable_tensor_fuser'. 
- bool enable_tensor_fuser = false; - err = ParseParameter(params, "ENABLE_TENSOR_FUSER", &enable_tensor_fuser); - if (err != nullptr) { - if (TRITONSERVER_ErrorCode(err) != TRITONSERVER_ERROR_NOT_FOUND) { - return err; - } else { - TRITONSERVER_ErrorDelete(err); - } - } else { - enable_tensor_fuser_pair_ = {true, enable_tensor_fuser}; - LOG_MESSAGE( - TRITONSERVER_LOG_INFO, - (std::string("Tensor fuser is ") + - (enable_tensor_fuser ? "enabled" : "disabled") + - " for model instance '" + Name() + "'") - .c_str()); - } - - // If 'ENABLE_WEIGHT_SHARING' is not present in 'parameters' then no - // update is made to 'enable_weight_sharing'. - err = ParseParameter( - params, "ENABLE_WEIGHT_SHARING", &enable_weight_sharing_); - if (err != nullptr) { - if (TRITONSERVER_ErrorCode(err) != TRITONSERVER_ERROR_NOT_FOUND) { - return err; - } else { - TRITONSERVER_ErrorDelete(err); - } - } else { - LOG_MESSAGE( - TRITONSERVER_LOG_INFO, - (std::string("Weight sharing is ") + - (enable_weight_sharing_ ? "enabled" : "disabled") + - " for model instance '" + Name() + "'") - .c_str()); - } - - // If 'ENABLE_JIT_PROFILING' is not present in 'parameters' then no update - // is made to 'enable_jit_profiling'. - bool enable_jit_profiling = false; - err = ParseParameter(params, "ENABLE_JIT_PROFILING", &enable_jit_profiling); - if (err != nullptr) { - if (TRITONSERVER_ErrorCode(err) != TRITONSERVER_ERROR_NOT_FOUND) { - return err; - } else { - TRITONSERVER_ErrorDelete(err); - } - } else { - enable_jit_profiling_pair_ = {true, enable_jit_profiling}; - LOG_MESSAGE( - TRITONSERVER_LOG_INFO, - (std::string("Jit profiling is ") + - (enable_jit_profiling ? "enabled" : "disabled") + - " for model instance '" + Name() + "'") - .c_str()); - } - - // If 'ENABLE_JIT_EXECUTOR' is not present in 'parameters' then no update is - // made to 'enable_jit_executor'. - bool enable_jit_executor = false; - err = ParseParameter(params, "ENABLE_JIT_EXECUTOR", &enable_jit_executor); - if (err != nullptr) { - if (TRITONSERVER_ErrorCode(err) != TRITONSERVER_ERROR_NOT_FOUND) { - return err; - } else { - TRITONSERVER_ErrorDelete(err); - } - } else { - enable_jit_executor_pair_ = {true, enable_jit_executor}; - LOG_MESSAGE( - TRITONSERVER_LOG_INFO, - (std::string("Jit executor is ") + - (enable_jit_executor ? "enabled" : "disabled") + - " for model instance '" + Name() + "'") - .c_str()); - } - - // If 'INTRA_OP_THREAD_COUNT' is not present in 'parameters' then no update - // is made to 'intra_op_thread_count', which by default will take all - // threads - int intra_op_thread_count = -1; - err = - ParseParameter(params, "INTRA_OP_THREAD_COUNT", &intra_op_thread_count); - if (err != nullptr) { - if (TRITONSERVER_ErrorCode(err) != TRITONSERVER_ERROR_NOT_FOUND) { - return err; - } else { - TRITONSERVER_ErrorDelete(err); - } - } else { - if (intra_op_thread_count > 0) { - // at::set_num_threads() does not throw if called more than once, but - // issues warnings. std::call_once() is useful to limit these. 
- std::call_once(pytorch_intraop_threads_flag, [intra_op_thread_count]() { - at::set_num_threads(intra_op_thread_count); - }); - LOG_MESSAGE( - TRITONSERVER_LOG_INFO, - (std::string("Intra op thread count is set to ") + - std::to_string(at::get_num_threads()) + " for model instance '" + - Name() + "'") - .c_str()); - } - } - - // If 'INTER_OP_THREAD_COUNT' is not present in 'parameters' then no update - // is made to 'inter_op_thread_count', which by default will take all - // threads - int inter_op_thread_count = -1; - err = - ParseParameter(params, "INTER_OP_THREAD_COUNT", &inter_op_thread_count); - if (err != nullptr) { - if (TRITONSERVER_ErrorCode(err) != TRITONSERVER_ERROR_NOT_FOUND) { - return err; - } else { - TRITONSERVER_ErrorDelete(err); - } - } else { - if (inter_op_thread_count > 0) { - // at::set_num_interop_threads() throws if called more than once. - // std::call_once() should prevent this, but try/catch is additionally - // used for safety. - std::call_once(pytorch_interop_threads_flag, [inter_op_thread_count]() { - try { - at::set_num_interop_threads(inter_op_thread_count); - } - catch (const c10::Error& e) { - // do nothing - } - }); - LOG_MESSAGE( - TRITONSERVER_LOG_INFO, - (std::string("Inter op thread count is set to ") + - std::to_string(at::get_num_interop_threads()) + - " for model instance '" + Name() + "'") - .c_str()); - } - } - } - - return nullptr; -} - -// The naming convention followed for inputs/outputs in the model configuration. -// Outputs don't support FORWARD_ARGUMENT. -enum class NamingConvention { - NAMED_INDEX, - FORWARD_ARGUMENT, - STRICT_CONFIG_ORDERING -}; - -// -// ModelInstanceState -// -// State associated with a model instance. An object of this class is -// created and associated with each TRITONBACKEND_ModelInstance. -// -class ModelInstanceState : public BackendModelInstance { - public: - static TRITONSERVER_Error* Create( - ModelState* model_state, - TRITONBACKEND_ModelInstance* triton_model_instance, - ModelInstanceState** state); - virtual ~ModelInstanceState(); - - // Get the state of the model that corresponds to this instance. - ModelState* StateForModel() const { return model_state_; } - - // Execute... 
- void ProcessRequests( - TRITONBACKEND_Request** requests, const uint32_t request_count); - - // Clear CUDA cache - void ClearCache(); - - private: - ModelInstanceState( - ModelState* model_state, - TRITONBACKEND_ModelInstance* triton_model_instance); - TRITONSERVER_Error* ValidateBooleanSequenceControl( - triton::common::TritonJson::Value& sequence_batching, - const std::string& control_kind, bool required, bool* have_control); - TRITONSERVER_Error* ValidateTypedSequenceControl( - triton::common::TritonJson::Value& sequence_batching, - const std::string& control_kind, bool required, bool* have_control); - TRITONSERVER_Error* ValidateInputs(const size_t expected_input_cnt); - void AddInputToMap( - NamingConvention naming_convention, - const std::vector allowed_inputs, const std::string& io_name, - const uint32_t index); - TRITONSERVER_Error* ValidateOutputs(); - void Execute( - std::vector* responses, - const uint32_t response_count, - std::vector* input_tensors, - std::vector* output_tensors); - TRITONSERVER_Error* SetInputTensors( - size_t total_batch_size, TRITONBACKEND_Request** requests, - const uint32_t request_count, - std::vector* responses, - BackendInputCollector* collector, std::vector* input_names, - std::vector* input_tensors, bool* cuda_copy); - TRITONSERVER_Error* ReadOutputTensors( - size_t total_batch_size, - const std::vector& output_tensors, - TRITONBACKEND_Request** requests, const uint32_t request_count, - std::vector* responses); - TRITONSERVER_Error* RecordBackendTimestamp( - uint64_t* timestamp, void* cuda_event); - - // Get the naming convention for inputs/outputs from the model configuration - TRITONSERVER_Error* GetNamingConvention( - NamingConvention* naming_convention, - const std::vector& allowed_io); - - // Create CUDA events for statistics collection. - void CreateCudaEvents(const int32_t& device_id); - - // Get the appropriate CUDA stream for input and output handling based on the - // instance group type. - cudaStream_t GetCudaStreamByInstanceKind(); - - // Replace the default CUDA stream with the stream we created to ensure proper - // cuda stream synchronization. - void SetCurrentCudaStream( - const cudaStream_t& stream, const int32_t& device_id); - - // Get the elapsed time between two CUDA events. - float GetCudaEventElapsedTime( - const cudaEvent_t& start_event, const cudaEvent_t& end_event); - - ModelState* model_state_; - - // The full path to the TorchScript model file. - std::string model_path_; - - std::shared_ptr torch_model_; - torch::Device device_; - - // Map from configuration name for an input to the index of - // that input in the model. - std::unordered_map input_index_map_; - uint32_t batch_input_count_ = 0; - - // Map from configuration name for an output to the index of - // that output in the model. - std::unordered_map output_index_map_; - std::unordered_map output_dtype_map_; - - // If the input to the tensor is a dictionary of tensors. - bool is_dict_input_; - - // If the model supports batching. - bool supports_batching_; - - cudaEvent_t compute_input_start_event_; - cudaEvent_t compute_infer_start_event_; - cudaEvent_t compute_output_start_event_; - - // Store the cuda streams created for the 'KIND_MODEL' instance group. - std::vector stream_vec_; - - // The number of available devices. 
- int device_cnt_; -}; - -TRITONSERVER_Error* -ModelInstanceState::Create( - ModelState* model_state, TRITONBACKEND_ModelInstance* triton_model_instance, - ModelInstanceState** state) -{ - try { - *state = new ModelInstanceState(model_state, triton_model_instance); - } - catch (const BackendModelInstanceException& ex) { - RETURN_ERROR_IF_TRUE( - ex.err_ == nullptr, TRITONSERVER_ERROR_INTERNAL, - std::string("unexpected nullptr in BackendModelInstanceException")); - RETURN_IF_ERROR(ex.err_); - } - - return nullptr; // success -} - -ModelInstanceState::ModelInstanceState( - ModelState* model_state, TRITONBACKEND_ModelInstance* triton_model_instance) - : BackendModelInstance(model_state, triton_model_instance), - model_state_(model_state), device_(torch::kCPU), is_dict_input_(false), - device_cnt_(0) -{ - if (Kind() == TRITONSERVER_INSTANCEGROUPKIND_GPU) { -#ifdef TRITON_ENABLE_GPU - device_ = torch::Device(torch::kCUDA, DeviceId()); - CreateCudaEvents(DeviceId()); -#endif - } - -#ifdef TRITON_ENABLE_GPU - device_cnt_ = torch::cuda::device_count(); -#endif - - THROW_IF_BACKEND_INSTANCE_ERROR(model_state->LoadModel( - ArtifactFilename(), device_, &model_path_, Kind(), &torch_model_)); - - if (Kind() == TRITONSERVER_INSTANCEGROUPKIND_MODEL) { -#ifdef TRITON_ENABLE_GPU - // Since we cannot determine the exact devices used by the model, we create - // a CUDA stream for every available device to ensure proper synchronization - // of CUDA streams. This approach may have implications when a timestamp is - // captured on a device that is not used by the model. Currently, this issue - // is addressed by synchronizing the CUDA streams before recording - // timestamps to prevent timestamp skewing. However, in the future, any - // modifications to the CUDA stream synchronization logic should be handled - // with caution. - for (int i = 0; i < device_cnt_; i++) { - cudaStream_t stream; - THROW_IF_BACKEND_INSTANCE_ERROR( - CreateCudaStream(i, 0 /* cuda_stream_priority */, &stream)); - stream_vec_.push_back(stream); - } - if (!stream_vec_.empty()) { - // Create CUDA events on the first device that will be used for collecting - // inputs/outputs. - CreateCudaEvents(0); - } -#endif - } - - size_t expected_input_cnt = 0; - { - triton::common::TritonJson::Value inputs; - if (model_state->ModelConfig().Find("input", &inputs)) { - expected_input_cnt = inputs.ArraySize(); - } - - triton::common::TritonJson::Value config_batch_inputs; - if (model_state->ModelConfig().Find("batch_input", &config_batch_inputs)) { - batch_input_count_ = config_batch_inputs.ArraySize(); - expected_input_cnt += batch_input_count_; - } - } - - // If this is a sequence model then make sure that the required - // inputs are present in the model and have the correct shape and - // datatype. 
- triton::common::TritonJson::Value sequence_batching; - if (model_state->ModelConfig().Find( - "sequence_batching", &sequence_batching)) { - bool have_start, have_end, have_ready, have_corrid; - THROW_IF_BACKEND_INSTANCE_ERROR(ValidateBooleanSequenceControl( - sequence_batching, "CONTROL_SEQUENCE_START", false /* required */, - &have_start)); - THROW_IF_BACKEND_INSTANCE_ERROR(ValidateBooleanSequenceControl( - sequence_batching, "CONTROL_SEQUENCE_END", false /* required */, - &have_end)); - THROW_IF_BACKEND_INSTANCE_ERROR(ValidateBooleanSequenceControl( - sequence_batching, "CONTROL_SEQUENCE_READY", false /* required */, - &have_ready)); - THROW_IF_BACKEND_INSTANCE_ERROR(ValidateTypedSequenceControl( - sequence_batching, "CONTROL_SEQUENCE_CORRID", false /* required */, - &have_corrid)); - if (have_start) { - expected_input_cnt += 1; - } - if (have_end) { - expected_input_cnt += 1; - } - if (have_ready) { - expected_input_cnt += 1; - } - if (have_corrid) { - expected_input_cnt += 1; - } - // Add the state inputs to the expected count - triton::common::TritonJson::Value states; - if (sequence_batching.Find("state", &states)) { - expected_input_cnt += states.ArraySize(); - } - } - supports_batching_ = model_state_->MaxBatchSize() > 0; - - THROW_IF_BACKEND_INSTANCE_ERROR(ValidateInputs(expected_input_cnt)); - THROW_IF_BACKEND_INSTANCE_ERROR(ValidateOutputs()); -} - -void -ModelInstanceState::ClearCache() -{ -#ifdef TRITON_ENABLE_GPU - if (device_.is_cuda() || - ((Kind() == TRITONSERVER_INSTANCEGROUPKIND_MODEL) && (device_cnt_ > 0))) { - c10::cuda::CUDACachingAllocator::emptyCache(); - } -#endif // TRITON_ENABLE_GPU -} - -ModelInstanceState::~ModelInstanceState() -{ - torch_model_.reset(); - ClearCache(); - - if (Kind() == TRITONSERVER_INSTANCEGROUPKIND_MODEL) { -#ifdef TRITON_ENABLE_GPU - for (size_t i = 0; i < stream_vec_.size(); i++) { - LOG_IF_ERROR( - ConvertCUDAStatusToTritonError( - cudaSetDevice(i), TRITONSERVER_ERROR_INTERNAL, - "Failed to set the device"), - "Failed to set the device"); - - LOG_IF_ERROR( - ConvertCUDAStatusToTritonError( - cudaStreamDestroy(stream_vec_[i]), TRITONSERVER_ERROR_INTERNAL, - "Failed to destroy cuda stream"), - "~ModelInstanceState error: "); - stream_vec_[i] = nullptr; - } -#endif - } -} - -TRITONSERVER_Error* -ModelInstanceState::ValidateBooleanSequenceControl( - triton::common::TritonJson::Value& sequence_batching, - const std::string& control_kind, bool required, bool* have_control) -{ - std::string tensor_name; - std::string tensor_datatype; - RETURN_IF_ERROR(GetBooleanSequenceControlProperties( - sequence_batching, model_state_->Name(), control_kind, required, - &tensor_name, &tensor_datatype, nullptr, nullptr, nullptr, nullptr, - nullptr, nullptr)); - *have_control = !tensor_name.empty(); - if (*have_control) { - std::string deliminator = "__"; - int ip_index = 0; - int start_pos = tensor_name.find(deliminator); - if (start_pos == -1) { - return TRITONSERVER_ErrorNew( - TRITONSERVER_ERROR_INTERNAL, - ("input '" + tensor_name + - "' does not follow __ naming convention.") - .c_str()); - } - - // check if the index part of the name is not an integer - std::string index_str = tensor_name.substr(start_pos + 2); - for (auto itr = index_str.begin(); itr != index_str.end(); itr++) { - if (std::isdigit(*itr) == 0) { - return TRITONSERVER_ErrorNew( - TRITONSERVER_ERROR_INTERNAL, - ("input '" + tensor_name + - "' does not follow __ naming convention.") - .c_str()); - } - } - - ip_index = std::atoi(tensor_name.substr(start_pos + 2).c_str()); - 
input_index_map_[tensor_name] = ip_index; - } - - return nullptr; // success -} - -TRITONSERVER_Error* -ModelInstanceState::ValidateTypedSequenceControl( - triton::common::TritonJson::Value& sequence_batching, - const std::string& control_kind, bool required, bool* have_control) -{ - std::string tensor_name; - std::string tensor_datatype; - RETURN_IF_ERROR(GetTypedSequenceControlProperties( - sequence_batching, model_state_->Name(), control_kind, required, - &tensor_name, &tensor_datatype)); - *have_control = !tensor_name.empty(); - if (*have_control) { - std::string deliminator = "__"; - int ip_index = 0; - int start_pos = tensor_name.find(deliminator); - if (start_pos == -1) { - return TRITONSERVER_ErrorNew( - TRITONSERVER_ERROR_INTERNAL, - ("input '" + tensor_name + - "' does not follow __ naming convention.") - .c_str()); - } - - // check if the index part of the name is not an integer - std::string index_str = tensor_name.substr(start_pos + 2); - for (auto itr = index_str.begin(); itr != index_str.end(); itr++) { - if (std::isdigit(*itr) == 0) { - return TRITONSERVER_ErrorNew( - TRITONSERVER_ERROR_INTERNAL, - ("input '" + tensor_name + - "' does not follow __ naming convention.") - .c_str()); - } - } - - // check if the data type is supported by PyTorch - if (!ModelConfigDataTypeToTorchType(tensor_datatype).first) { - return TRITONSERVER_ErrorNew( - TRITONSERVER_ERROR_INTERNAL, - ("input '" + tensor_name + "' type '" + tensor_datatype + - "' is not supported by PyTorch.") - .c_str()); - } - - ip_index = std::atoi(tensor_name.substr(start_pos + 2).c_str()); - input_index_map_[tensor_name] = ip_index; - } - - return nullptr; // success -} - -void -ModelInstanceState::AddInputToMap( - NamingConvention naming_convention, - const std::vector allowed_inputs, const std::string& io_name, - const uint32_t index) -{ - std::string deliminator = "__"; - - if (is_dict_input_) { - // If dictionary, index is irrelevant but we use the map to store the - // input names since they are the keys for the dictionary - input_index_map_[io_name] = index; - } else { - switch (naming_convention) { - case NamingConvention::FORWARD_ARGUMENT: { - auto itr = - std::find(allowed_inputs.begin(), allowed_inputs.end(), io_name); - if (itr != allowed_inputs.end()) { - input_index_map_[io_name] = - std::distance(allowed_inputs.begin(), itr); - } - return; - } - case NamingConvention::NAMED_INDEX: { - int start_pos = io_name.find(deliminator); - int ip_index = std::atoi(io_name.substr(start_pos + 2).c_str()); - input_index_map_[io_name] = ip_index; - return; - } - case NamingConvention::STRICT_CONFIG_ORDERING: { - input_index_map_[io_name] = index; - return; - } - } - } -} - -TRITONSERVER_Error* -ModelInstanceState::ValidateInputs(const size_t expected_input_cnt) -{ - // Collect all the expected input tensor names and validate that the model - // configuration specifies only those. - std::vector allowed_inputs; - - const torch::jit::Method& method = torch_model_->get_method("forward"); - const auto& schema = method.function().getSchema(); - const std::vector& arguments = schema.arguments(); - - // Currently, only models with a single input of type Dict(str, Tensor) are - // supported. If the model expects more than one input then they must be all - // be of type Tensor. 
- // - // Ignore the argument at idx 0 if it is of Class type (self param in forward - // function) - size_t start_idx = 0; - if ((arguments.size() > 0) && - (arguments.at(0).type()->kind() == c10::TypeKind::ClassType)) { - start_idx = 1; - } - if ((arguments.size() == (1 + start_idx)) && - (arguments.at(start_idx).type()->kind() == c10::TypeKind::DictType)) { - is_dict_input_ = true; - } else if (arguments.size() > start_idx) { - // Return error if multiple inputs are of kind DictType - for (size_t i = start_idx + 1; i < arguments.size(); i++) { - if (arguments.at(i).type()->kind() == c10::TypeKind::DictType) { - return TRITONSERVER_ErrorNew( - TRITONSERVER_ERROR_INTERNAL, - "Multiple inputs of kind DictType were detected. Only a single " - "input of type Dict(str, Tensor) is supported."); - } - } - - // Return error if all inputs are not of type Tensor - for (size_t i = start_idx; i < arguments.size(); i++) { - if ((arguments.at(i).type()->kind() != c10::TypeKind::TensorType) && - (arguments.at(i).type()->kind() != c10::TypeKind::ListType)) { - return TRITONSERVER_ErrorNew( - TRITONSERVER_ERROR_INTERNAL, - (std::string("An input of type '") + arguments.at(i).type()->str() + - "' was detected in the model. Only a single input of type " - "Dict(str, Tensor) or input(s) of type Tensor are supported.") - .c_str()); - } - allowed_inputs.emplace_back(arguments.at(i).name()); - } - - // If all inputs are tensors, match number of expected inputs between model - // and configuration - if ((arguments.size() - start_idx) != expected_input_cnt) { - return TRITONSERVER_ErrorNew( - TRITONSERVER_ERROR_INVALID_ARG, - (std::string("unable to load model '") + model_state_->Name() + - "', configuration expects " + std::to_string(expected_input_cnt) + - " inputs, model provides " + - std::to_string(arguments.size() - start_idx)) - .c_str()); - } - } - - triton::common::TritonJson::Value ios; - RETURN_IF_ERROR(model_state_->ModelConfig().MemberAsArray("input", &ios)); - - if (ios.ArraySize() == 0) { - return TRITONSERVER_ErrorNew( - TRITONSERVER_ERROR_INTERNAL, - "model configuration must contain at least one input, none were " - "specified."); - } - - NamingConvention naming_convention; - RETURN_IF_ERROR(GetNamingConvention(&naming_convention, allowed_inputs)); - - for (size_t i = 0; i < ios.ArraySize(); i++) { - triton::common::TritonJson::Value io; - RETURN_IF_ERROR(ios.IndexAsObject(i, &io)); - - // Validate name - std::string io_name; - RETURN_IF_ERROR(io.MemberAsString("name", &io_name)); - AddInputToMap(naming_convention, allowed_inputs, io_name, i); - // Validate data type - std::string io_dtype; - RETURN_IF_ERROR(io.MemberAsString("data_type", &io_dtype)); - const auto pr = ModelConfigDataTypeToTorchType(io_dtype); - if (!pr.first && (io_dtype != "TYPE_STRING")) { - return TRITONSERVER_ErrorNew( - TRITONSERVER_ERROR_INTERNAL, - ("unsupported datatype " + io_dtype + " for input '" + io_name + - "' for model '" + model_state_->Name() + "'") - .c_str()); - } - - // Validate shape for String inputs. Only allow 1 dimension. - if (io_dtype == "TYPE_STRING") { - // If a reshape is provided for the input then use that when - // validating the model shapes. - std::vector dims; - triton::common::TritonJson::Value reshape; - if (io.Find("reshape", &reshape)) { - RETURN_IF_ERROR(ParseShape(reshape, "shape", &dims)); - } else { - RETURN_IF_ERROR(ParseShape(io, "dims", &dims)); - } - - if ((dims.size() + (supports_batching_ ? 
1 : 0)) > 1) { - return TRITONSERVER_ErrorNew( - TRITONSERVER_ERROR_INTERNAL, - ("Triton only supports 1 dimensional List of String as input for " - "'" + - std::string(io_name) + "' for model '" + model_state_->Name() + - "'") - .c_str()); - } - } - } - triton::common::TritonJson::Value sequence_batching; - if (model_state_->ModelConfig().Find( - "sequence_batching", &sequence_batching)) { - triton::common::TritonJson::Value states; - if (sequence_batching.Find("state", &states)) { - for (size_t i = 0; i < states.ArraySize(); i++) { - triton::common::TritonJson::Value state; - RETURN_IF_ERROR(states.IndexAsObject(i, &state)); - std::string state_name; - RETURN_IF_ERROR(state.MemberAsString("input_name", &state_name)); - AddInputToMap(naming_convention, allowed_inputs, state_name, i); - - // Validate data type - std::string state_dtype; - RETURN_IF_ERROR(state.MemberAsString("data_type", &state_dtype)); - const auto pr = ModelConfigDataTypeToTorchType(state_dtype); - if (!pr.first && (state_dtype != "TYPE_STRING")) { - return TRITONSERVER_ErrorNew( - TRITONSERVER_ERROR_INTERNAL, - ("unsupported datatype " + state_dtype + " for input state '" + - state_name + "' for model '" + model_state_->Name() + "'") - .c_str()); - } - - // Validate shape for String inputs. Only allow 1 dimension. - if (state_dtype == "TYPE_STRING") { - std::vector dims; - if ((dims.size() + (supports_batching_ ? 1 : 0)) > 1) { - return TRITONSERVER_ErrorNew( - TRITONSERVER_ERROR_INTERNAL, - ("Triton only supports 1 dimensional List of String as input " - "for " - "'" + - std::string(state_name) + "' for model '" + - model_state_->Name() + "'") - .c_str()); - } - } - } - } - } - - triton::common::TritonJson::Value batch_inputs; - RETURN_IF_ERROR( - model_state_->ModelConfig().MemberAsArray("batch_input", &batch_inputs)); - size_t i = 0; - for (const auto& batch_input : StateForModel()->BatchInputs()) { - for (const auto& input_name : batch_input.TargetNames()) { - AddInputToMap( - naming_convention, allowed_inputs, input_name, i + ios.ArraySize()); - i++; - } - } - - return nullptr; // success -} - -TRITONSERVER_Error* -ModelInstanceState::ValidateOutputs() -{ - triton::common::TritonJson::Value ios; - RETURN_IF_ERROR(model_state_->ModelConfig().MemberAsArray("output", &ios)); - std::string deliminator = "__"; - int op_index = 0; - - if (ios.ArraySize() == 0) { - return TRITONSERVER_ErrorNew( - TRITONSERVER_ERROR_INTERNAL, - "model configuration must contain at least one output, none were " - "specified."); - } - - NamingConvention naming_convention; - RETURN_IF_ERROR(GetNamingConvention(&naming_convention, {})); - - for (size_t i = 0; i < ios.ArraySize(); i++) { - triton::common::TritonJson::Value io; - RETURN_IF_ERROR(ios.IndexAsObject(i, &io)); - - // Validate name - std::string io_name; - RETURN_IF_ERROR(io.MemberAsString("name", &io_name)); - switch (naming_convention) { - case NamingConvention::NAMED_INDEX: { - int start_pos = io_name.find(deliminator); - op_index = std::atoi(io_name.substr(start_pos + 2).c_str()); - break; - } - case NamingConvention::STRICT_CONFIG_ORDERING: { - op_index = i; - break; - } - default: - break; - } - - // Validate data type - std::string io_dtype; - RETURN_IF_ERROR(io.MemberAsString("data_type", &io_dtype)); - const auto pr = ModelConfigDataTypeToTorchType(io_dtype); - if (!pr.first && (io_dtype != "TYPE_STRING")) { - return TRITONSERVER_ErrorNew( - TRITONSERVER_ERROR_INTERNAL, - ("unsupported datatype " + io_dtype + " for output '" + io_name + - "' for model '" + 
model_state_->Name() + "'") - .c_str()); - } - - // Validate shape for String outputs. Only allow 1 dimension. - if (io_dtype == "TYPE_STRING") { - // If a reshape is provided for the output then use that when - // validating the model shapes. - std::vector dims; - triton::common::TritonJson::Value reshape; - if (io.Find("reshape", &reshape)) { - RETURN_IF_ERROR(ParseShape(reshape, "shape", &dims)); - } else { - RETURN_IF_ERROR(ParseShape(io, "dims", &dims)); - } - - if ((dims.size() + (supports_batching_ ? 1 : 0)) > 1) { - return TRITONSERVER_ErrorNew( - TRITONSERVER_ERROR_INTERNAL, - ("Triton only supports 1 dimensional List of String as output for " - "'" + - std::string(io_name) + "' for model '" + model_state_->Name() + - "'") - .c_str()); - } - } - - output_index_map_[io_name] = op_index; - output_dtype_map_[io_name] = ConvertTorchTypeToDataType(pr.second); - } - - triton::common::TritonJson::Value sequence_batching; - if (model_state_->ModelConfig().Find( - "sequence_batching", &sequence_batching)) { - triton::common::TritonJson::Value states; - if (sequence_batching.Find("state", &states)) { - for (size_t i = 0; i < states.ArraySize(); i++) { - triton::common::TritonJson::Value state; - RETURN_IF_ERROR(states.IndexAsObject(i, &state)); - std::string state_name; - RETURN_IF_ERROR(state.MemberAsString("output_name", &state_name)); - std::string state_dtype; - RETURN_IF_ERROR(state.MemberAsString("data_type", &state_dtype)); - std::vector dims; - RETURN_IF_ERROR(ParseShape(state, "dims", &dims)); - - // For state, naming convention is enforced to be NAMED_INDEX - int start_pos = state_name.find(deliminator); - op_index = std::atoi(state_name.substr(start_pos + 2).c_str()); - - const auto pr = ModelConfigDataTypeToTorchType(state_dtype); - if (!pr.first && (state_dtype != "TYPE_STRING")) { - return TRITONSERVER_ErrorNew( - TRITONSERVER_ERROR_INTERNAL, - ("unsupported datatype " + state_dtype + " for state '" + - state_name + "' for model '" + model_state_->Name() + "'") - .c_str()); - } - - // Validate shape for String outputs. Only allow 1 dimension. - if (state_dtype == "TYPE_STRING") { - if ((dims.size() + (supports_batching_ ? 1 : 0)) > 1) { - return TRITONSERVER_ErrorNew( - TRITONSERVER_ERROR_INTERNAL, - ("Triton only supports 1 dimensional List of String as output " - "for " - "'" + - std::string(state_name) + "' for model '" + - model_state_->Name() + "'") - .c_str()); - } - } - - output_index_map_[state_name] = op_index; - output_dtype_map_[state_name] = ConvertTorchTypeToDataType(pr.second); - } - } - } - - return nullptr; // success -} - -void -ModelInstanceState::ProcessRequests( - TRITONBACKEND_Request** requests, const uint32_t request_count) -{ - LOG_MESSAGE( - TRITONSERVER_LOG_VERBOSE, - (std::string("TRITONBACKEND_ModelExecute: Running ") + Name() + " with " + - std::to_string(request_count) + " requests") - .c_str()); - -#ifdef TRITON_ENABLE_GPU - if (Kind() == TRITONSERVER_INSTANCEGROUPKIND_GPU) { - SetCurrentCudaStream(stream_, DeviceId()); - } else if (Kind() == TRITONSERVER_INSTANCEGROUPKIND_MODEL) { - // Replace the default stream of each device with the one we created. - for (size_t i = 0; i < stream_vec_.size(); i++) { - SetCurrentCudaStream(stream_vec_[i], i); - } - } -#endif - - NVTX_RANGE(nvtx_, "ProcessRequests " + Name()); - - uint64_t exec_start_ns = 0; - SET_TIMESTAMP(exec_start_ns); - - const int max_batch_size = model_state_->MaxBatchSize(); - - // For each request collect the total batch size for this inference - // execution. 
The batch-size, number of inputs, and size of each - // input have already been checked, so we don't need to do that here. - size_t total_batch_size = 0; - for (size_t i = 0; i < request_count; i++) { - // If we get a nullptr request then something is badly wrong. Fail - // and release all requests. - if (requests[i] == nullptr) { - RequestsRespondWithError( - requests, request_count, - TRITONSERVER_ErrorNew( - TRITONSERVER_ERROR_INTERNAL, - std::string( - "null request given to PyTorch backend for '" + Name() + "'") - .c_str())); - return; - } - } - - // At this point we are committed to running inference with all - // 'requests'. Create a response for each request. During input - // processing if there is an error with any request that error will - // be sent immediately with the corresponding response (and the - // response unique_ptr will then be nullptr). The request object - // itself will not be released until after all inferencing is done - // (below) as we may need to access the request object when - // determining how to process outputs (for example, even if we don't - // need the outputs for a request that has an error, we do need to - // know the size of those outputs associated with the request so we - // can skip them in the output tensors). - std::vector<TRITONBACKEND_Response*> responses; - responses.reserve(request_count); - bool all_response_failed = false; - - for (size_t i = 0; i < request_count; i++) { - TRITONBACKEND_Response* response; - auto err = TRITONBACKEND_ResponseNew(&response, requests[i]); - if (err == nullptr) { - responses.emplace_back(response); - } else { - responses.emplace_back(nullptr); - LOG_MESSAGE(TRITONSERVER_LOG_ERROR, "Failed to create response"); - TRITONSERVER_ErrorDelete(err); - } - } - - for (size_t i = 0; i < request_count; i++) { - if (max_batch_size > 0) { - // Retrieve the batch size from one of the inputs. If the model - // supports batching, the first dimension size is the batch size. - TRITONBACKEND_Input* input; - TRITONSERVER_Error* err = - TRITONBACKEND_RequestInputByIndex(requests[i], 0 /* index */, &input); - if (err == nullptr) { - const int64_t* shape; - err = TRITONBACKEND_InputProperties( - input, nullptr, nullptr, &shape, nullptr, nullptr, nullptr); - total_batch_size += shape[0]; - } - if (err != nullptr) { - RESPOND_ALL_AND_SET_TRUE_IF_ERROR( - responses, request_count, all_response_failed, err); - } - } else { - total_batch_size += 1; - } - } - - // If there are no valid payloads then no need to run the inference. - if (total_batch_size == 0) { - return; - } - - // Make sure the maximum batch size is not exceeded. The - // total_batch_size must be 1 for models that don't support batching - // (i.e. max_batch_size == 0). If max_batch_size is exceeded then the - // scheduler has done something badly wrong so fail and release all - // requests. - if (!all_response_failed) { - if ((total_batch_size != 1) && - (total_batch_size > (size_t)max_batch_size)) { - RESPOND_ALL_AND_SET_TRUE_IF_ERROR( - responses, request_count, all_response_failed, - TRITONSERVER_ErrorNew( - TRITONSERVER_ERROR_INTERNAL, - std::string( - "batch size " + std::to_string(total_batch_size) + " for '" + - Name() + "', max allowed is " + - std::to_string(max_batch_size)) - .c_str())); - } - } - - std::vector<const char*> input_names; - std::vector<torch::jit::IValue> input_tensors; - bool cuda_copy = false; - std::unique_ptr<BackendInputCollector> collector; - - // For 'KIND_MODEL', it's fine to use CUDA events to calculate the compute - // input duration since only one stream will be used for input collection.
- if ((Kind() == TRITONSERVER_INSTANCEGROUPKIND_GPU) || - ((Kind() == TRITONSERVER_INSTANCEGROUPKIND_MODEL) && (device_cnt_ > 0))) { -#ifdef TRITON_ENABLE_GPU - RESPOND_ALL_AND_SET_TRUE_IF_ERROR( - responses, request_count, all_response_failed, - ConvertCUDAStatusToTritonError( - cudaEventRecord( - compute_input_start_event_, GetCudaStreamByInstanceKind()), - TRITONSERVER_ERROR_INTERNAL, "Failed to record the event.")); -#endif - } - - if (!all_response_failed) { - collector.reset(new BackendInputCollector( - requests, request_count, &responses, - model_state_->TritonMemoryManager(), model_state_->EnablePinnedInput(), - GetCudaStreamByInstanceKind(), nullptr, nullptr, 0, - HostPolicyName().c_str())); - RESPOND_ALL_AND_SET_TRUE_IF_ERROR( - responses, request_count, all_response_failed, - SetInputTensors( - total_batch_size, requests, request_count, &responses, - collector.get(), &input_names, &input_tensors, &cuda_copy)); - } - -#ifdef TRITON_ENABLE_GPU - if (cuda_copy) { - cudaStreamSynchronize(GetCudaStreamByInstanceKind()); - cuda_copy = false; - } -#endif - - std::vector output_tensors; - uint64_t compute_start_ns = 0; - uint64_t compute_infer_start = 0; - - RESPOND_ALL_AND_SET_TRUE_IF_ERROR( - responses, request_count, all_response_failed, - RecordBackendTimestamp( - &compute_start_ns, - reinterpret_cast(&compute_infer_start_event_))); - - // For 'KIND_MODEL', capture the timestamp for the compute infer duration. - if ((Kind() == TRITONSERVER_INSTANCEGROUPKIND_MODEL) && (device_cnt_ > 0)) { - SET_TIMESTAMP(compute_infer_start); - } - - // Run... - if (!all_response_failed) { - Execute(&responses, request_count, &input_tensors, &output_tensors); - } - - // Verify output indices are valid with number of outputs after execution - bool invalid_index = false; - int max_index = output_tensors.size() - 1; - - if (!all_response_failed) { - for (const auto& name : model_state_->ModelOutputs()) { - int op_index = output_index_map_[name.first]; - if ((op_index < 0) || (op_index > max_index)) { - RESPOND_ALL_AND_SET_TRUE_IF_ERROR( - responses, request_count, all_response_failed, - TRITONSERVER_ErrorNew( - TRITONSERVER_ERROR_INVALID_ARG, - std::string( - "The output " + std::string(name.first) + - " in the model configuration refers to an output index " - "which doesn't exist. This model has " + - std::to_string(max_index + 1) + " outputs") - .c_str())); - invalid_index = true; - break; - } - } - } - -#ifdef TRITON_ENABLE_GPU - if (Kind() == TRITONSERVER_INSTANCEGROUPKIND_MODEL) { - // For 'KIND_MODEL', multiple streams will be involved, so we need to call - // 'cudaStreamSynchronize' before reading the output tensors. 
- for (auto& stream : stream_vec_) { - cudaStreamSynchronize(stream); - } - } -#endif - - uint64_t compute_end_ns = 0; - uint64_t compute_output_start = 0; - - if ((Kind() == TRITONSERVER_INSTANCEGROUPKIND_MODEL) && (device_cnt_ > 0)) { -#ifdef TRITON_ENABLE_GPU - SET_TIMESTAMP(compute_output_start); -#endif - } else { - RESPOND_ALL_AND_SET_TRUE_IF_ERROR( - responses, request_count, all_response_failed, - RecordBackendTimestamp( - &compute_end_ns, - reinterpret_cast<void*>(&compute_output_start_event_))); - } - - if (!all_response_failed) { - if (!invalid_index) { - RESPOND_ALL_AND_SET_TRUE_IF_ERROR( - responses, request_count, all_response_failed, - ReadOutputTensors( - total_batch_size, output_tensors, requests, request_count, - &responses)); - } - } - - uint64_t exec_end_ns = 0; - SET_TIMESTAMP(exec_end_ns); - - // Send all the responses that haven't already been sent because of - // an earlier error. Note that the responses are not set to nullptr - // here as we need that indication below to determine if the request - // was successful or not. - for (auto& response : responses) { - if (response != nullptr) { - LOG_IF_ERROR( - TRITONBACKEND_ResponseSend( - response, TRITONSERVER_RESPONSE_COMPLETE_FINAL, nullptr), - "failed to send PyTorch backend response"); - } - } - - // We don't need an explicit CUDA synchronization here since we have already - // synchronized the stream in the ReadOutputTensors function. - if (Kind() == TRITONSERVER_INSTANCEGROUPKIND_GPU) { -#ifdef TRITON_ENABLE_GPU - float compute_input_duration = GetCudaEventElapsedTime( - compute_input_start_event_, compute_infer_start_event_); - float compute_infer_duration = GetCudaEventElapsedTime( - compute_infer_start_event_, compute_output_start_event_); - - compute_start_ns = exec_start_ns + (compute_input_duration * 1e6); - compute_end_ns = compute_start_ns + (compute_infer_duration * 1e6); -#endif - } else if ( - (Kind() == TRITONSERVER_INSTANCEGROUPKIND_MODEL) && (device_cnt_ > 0)) { -#ifdef TRITON_ENABLE_GPU - float compute_input_duration = GetCudaEventElapsedTime( - compute_input_start_event_, compute_infer_start_event_); - uint64_t compute_infer_duration = - compute_output_start - compute_infer_start; - - compute_start_ns = exec_start_ns + (compute_input_duration * 1e6); - compute_end_ns = compute_start_ns + compute_infer_duration; -#endif - } - - // Report statistics for each request. - for (uint32_t r = 0; r < request_count; ++r) { - auto& request = requests[r]; - LOG_IF_ERROR( - TRITONBACKEND_ModelInstanceReportStatistics( - TritonModelInstance(), request, - (responses[r] != nullptr) /* success */, exec_start_ns, - compute_start_ns, compute_end_ns, exec_end_ns), - "failed reporting request statistics"); - - LOG_IF_ERROR( - TRITONBACKEND_RequestRelease(request, TRITONSERVER_REQUEST_RELEASE_ALL), - "failed releasing request"); - } - - if (!all_response_failed) { - // Report the entire batch statistics.
- LOG_IF_ERROR( - TRITONBACKEND_ModelInstanceReportBatchStatistics( - TritonModelInstance(), total_batch_size, exec_start_ns, - compute_start_ns, compute_end_ns, exec_end_ns), - "failed reporting batch request statistics"); - } -} - -void -ModelInstanceState::Execute( - std::vector* responses, - const uint32_t response_count, - std::vector* input_tensors, - std::vector* output_tensors) -{ - NVTX_RANGE(nvtx_, "Execute " + Name()); - - torch::jit::IValue model_outputs_; - - try { - // enable/disable optimized execution - torch::jit::setGraphExecutorOptimize( - model_state_->EnabledOptimizedExecution()); - - // enable/disable inference mode - supersedes NoGradGuard - torch::InferenceMode infer_guard(model_state_->EnabledInferenceMode()); - - // enable/disable cudnn - at::globalContext().setUserEnabledCuDNN(model_state_->EnabledCudnn()); - - // JIT. No change is made unless parameter is explicitly set. - if (std::get<0>(model_state_->EnabledJitProfiling())) { - torch::jit::getProfilingMode() = - std::get<1>(model_state_->EnabledJitProfiling()); - } - - if (std::get<0>(model_state_->EnabledJitExecutor())) { - torch::jit::getExecutorMode() = - std::get<1>(model_state_->EnabledJitExecutor()); - } - - // Fuser. No change is made unless fuser is explicitly set in - // parameters. - if (std::get<0>(model_state_->EnabledTensorExprFuser())) { - torch::jit::setTensorExprFuserEnabled( - std::get<1>(model_state_->EnabledTensorExprFuser())); - } - - torch::NoGradGuard no_grad; - - // If input is a dictionary, prepare dictionary from 'input_tensors'. - if (is_dict_input_) { - torch::Dict input_dict; - for (auto& input_index : input_index_map_) { - torch::jit::IValue ival = (*input_tensors)[input_index.second]; - input_dict.insert(input_index.first, ival.toTensor()); - } - std::vector input_dict_ivalue = {input_dict}; - model_outputs_ = torch_model_->forward(input_dict_ivalue); - } else { - model_outputs_ = torch_model_->forward(*input_tensors); - } - - if (model_outputs_.isTuple()) { - auto model_outputs_tuple = model_outputs_.toTuple(); - size_t op_index = 0; - for (auto& m_op : model_outputs_tuple->elements()) { - if (m_op.isList()) { - auto list_output = m_op.toList(); - if (list_output.elementType()->kind() != c10::TypeKind::StringType) { - throw std::invalid_argument( - "output at index " + std::to_string(op_index) + - " must be of type Tensor or List[str], received List[" + - list_output.elementType()->str() + "]"); - } - output_tensors->push_back(m_op); - } else { - auto tensor_output = m_op.toTensor(); - output_tensors->push_back(m_op); - } - op_index++; - } - } else if (model_outputs_.isTensor()) { - output_tensors->push_back(model_outputs_); - } else if (model_outputs_.isList()) { - auto list_output = model_outputs_.toList(); - if (list_output.elementType()->kind() != c10::TypeKind::StringType) { - throw std::invalid_argument( - "output must be of type Tensor or List[str], received List[" + - list_output.elementType()->str() + "]"); - } - output_tensors->push_back(model_outputs_); - } else { - throw std::invalid_argument( - "output must be of type Tensor, List[str] or Tuple containing one of " - "these two types. 
It should not be a List / Dictionary of Tensors or " - "a Scalar"); - } - } - catch (std::exception& ex) { - SendErrorForResponses( - responses, response_count, - TRITONSERVER_ErrorNew( - TRITONSERVER_ERROR_INTERNAL, - ("PyTorch execute failure: " + std::string(ex.what())).c_str())); - } -} - -TRITONSERVER_Error* -ModelInstanceState::GetNamingConvention( - NamingConvention* naming_convention, - const std::vector& allowed_ios) -{ - // Rules for (non-Dictionary) input tensor names: - // 1. Must be in 'allowed_inputs' (arguments in the forward function) - // 2. Must follow the naming convention i.e. __ - // 3. If neither of the above conditions are satisfied, enforce strict - // ordering of model inputs. - // - // Rules for output tensor names: - // 1. Must follow the naming convention i.e. __ - // 2. If not, we enforce strict ordering of model outputs. - std::string deliminator = "__"; - std::string io_kind = "input"; - *naming_convention = NamingConvention::FORWARD_ARGUMENT; - - // symbolizes output - if (allowed_ios.size() == 0) { - io_kind = "output"; - *naming_convention = NamingConvention::NAMED_INDEX; - } - - triton::common::TritonJson::Value ios; - RETURN_IF_ERROR( - model_state_->ModelConfig().MemberAsArray(io_kind.c_str(), &ios)); - - if (io_kind == "input") { - for (size_t i = 0; i < ios.ArraySize(); i++) { - triton::common::TritonJson::Value io; - RETURN_IF_ERROR(ios.IndexAsObject(i, &io)); - - // Validate name - std::string io_name; - RETURN_IF_ERROR(io.MemberAsString("name", &io_name)); - auto itr = std::find(allowed_ios.begin(), allowed_ios.end(), io_name); - if (itr == allowed_ios.end()) { - *naming_convention = NamingConvention::NAMED_INDEX; - break; - } - } - } - - // If not, check if inputs follow INDEX - if (*naming_convention == NamingConvention::NAMED_INDEX) { - for (size_t i = 0; i < ios.ArraySize(); i++) { - triton::common::TritonJson::Value io; - RETURN_IF_ERROR(ios.IndexAsObject(i, &io)); - - // Validate name - std::string io_name; - RETURN_IF_ERROR(io.MemberAsString("name", &io_name)); - int start_pos = io_name.find(deliminator); - if (start_pos == -1) { - *naming_convention = NamingConvention::STRICT_CONFIG_ORDERING; - break; - } else { - // check if the index part of the name is not an integer - std::string index_str = io_name.substr(start_pos + 2); - bool is_int = true; - for (auto itr = index_str.begin(); itr != index_str.end(); itr++) { - if (std::isdigit(*itr) == 0) { - is_int = false; - } - } - - if (!is_int) { - if (io_kind == "input") { - LOG_MESSAGE( - TRITONSERVER_LOG_WARN, - ("input '" + io_name + - "' or previous input(s) are neither an input argument to the " - "model '" + - model_state_->Name() + - "' nor do they follow the __ naming convention. " - "Falling back to enforcing strict ordering from model " - "configuration.") - .c_str()); - } else { - LOG_MESSAGE( - TRITONSERVER_LOG_WARN, - ("output '" + io_name + - "' or previous output(s) of the model '" + - model_state_->Name() + - "' do not follow the __ naming convention. 
" - "Falling back to enforcing strict ordering from model " - "configuration.") - .c_str()); - } - *naming_convention = NamingConvention::STRICT_CONFIG_ORDERING; - break; - } - } - } - } - - triton::common::TritonJson::Value sequence_batching; - if (model_state_->ModelConfig().Find( - "sequence_batching", &sequence_batching)) { - // If we need to manage state for the model, then we need to check - // the naming of the state adheres to both the input and output conventions - triton::common::TritonJson::Value states; - if (sequence_batching.Find("state", &states)) { - if (*naming_convention != NamingConvention::NAMED_INDEX) { - return TRITONSERVER_ErrorNew( - TRITONSERVER_ERROR_INVALID_ARG, - ("PyTorch model '" + model_state_->Name() + - "' is using sequence batching with state but not all inputs and " - "outputs follow the __ naming convention. ") - .c_str()); - } - } - - for (size_t i = 0; i < states.ArraySize(); i++) { - triton::common::TritonJson::Value state; - RETURN_IF_ERROR(states.IndexAsObject(i, &state)); - std::string name_entry = - io_kind == "input" ? "input_name" : "output_name"; - std::string state_name; - RETURN_IF_ERROR(state.MemberAsString(name_entry.c_str(), &state_name)); - int start_pos = state_name.find(deliminator); - if (start_pos == -1) { - return TRITONSERVER_ErrorNew( - TRITONSERVER_ERROR_INVALID_ARG, - ("PyTorch model '" + model_state_->Name() + - "' is using sequence batching with state but state '" + - state_name + - "' does not follow the __ naming convention. ") - .c_str()); - } else { - // check if the index part of the name is not an integer - std::string index_str = state_name.substr(start_pos + 2); - bool is_int = true; - for (auto itr = index_str.begin(); itr != index_str.end(); itr++) { - if (std::isdigit(*itr) == 0) { - is_int = false; - } - } - if (!is_int) { - return TRITONSERVER_ErrorNew( - TRITONSERVER_ERROR_INVALID_ARG, - ("PyTorch model '" + model_state_->Name() + - "' is using sequence batching with state but state '" + - state_name + - "' does not follow the __ naming convention. ") - .c_str()); - } - } - } - } - - return nullptr; // success -} - -// This function will return a tensor's contents as a contiguous -// chunk in system memory. In some cases this will require copying the data. -// If that happens, 'contiguous_buffer' will be set to hold the contiguous -// chunk and 'cuda_copy' will be set to indicate whether CUDA copy is -// conducted. The data copy can be avoided if the input is already in -// a contiguous chunk and the input is located in memory type and id -// specified. 
-TRITONSERVER_Error* -GetContiguousInputContent( - TRITONBACKEND_Input* rinput, const uint32_t buffer_count, - const char** content, size_t* content_byte_size, - std::vector* contiguous_buffer, cudaStream_t stream, bool* cuda_copy) -{ - *cuda_copy = false; - - // Check input buffers to see if data copy is necessary - size_t chunk_count = 0; - bool type_mismatch = false; - uint64_t total_byte_size = 0; - for (size_t idx = 0; idx < buffer_count; ++idx) { - TRITONSERVER_MemoryType src_memory_type; - int64_t src_memory_type_id; - size_t src_byte_size; - const void* src_ptr; - - RETURN_IF_ERROR(TRITONBACKEND_InputBuffer( - rinput, idx, &src_ptr, &src_byte_size, &src_memory_type, - &src_memory_type_id)); - - if (src_ptr != nullptr) { - chunk_count++; - total_byte_size += src_byte_size; - type_mismatch |= (src_memory_type == TRITONSERVER_MEMORY_GPU); - } - } - - if (chunk_count == 0) { - *content = nullptr; - *content_byte_size = 0; - } else if ((chunk_count == 1) && !type_mismatch) { - TRITONSERVER_MemoryType src_memory_type; - int64_t src_memory_type_id; - RETURN_IF_ERROR(TRITONBACKEND_InputBuffer( - rinput, 0, (const void**)content, content_byte_size, &src_memory_type, - &src_memory_type_id)); - } else { - contiguous_buffer->resize(total_byte_size); - - size_t offset = 0; - for (size_t i = 0; i < chunk_count; i++) { - bool cuda_used; - TRITONSERVER_MemoryType src_memory_type; - int64_t src_memory_type_id; - size_t src_byte_size; - const void* src_ptr; - - RETURN_IF_ERROR(TRITONBACKEND_InputBuffer( - rinput, i, &src_ptr, &src_byte_size, &src_memory_type, - &src_memory_type_id)); - RETURN_IF_ERROR(CopyBuffer( - "Contiguous input", src_memory_type, src_memory_type_id, - TRITONSERVER_MEMORY_CPU, 0, src_byte_size, src_ptr, - contiguous_buffer->data() + offset, stream, &cuda_used)); - *cuda_copy |= cuda_used; - offset += src_byte_size; - } - - *content = contiguous_buffer->data(); - *content_byte_size = total_byte_size; - } - - return nullptr; // success -} - -void -FillStringTensor(torch::List* input_list, const size_t cnt) -{ - for (size_t c = 0; c < cnt; ++c) { - input_list->push_back(""); - } -} - -bool -SetStringInputTensor( - torch::List* input_list, TRITONBACKEND_Input* input, - const char* name, const uint32_t buffer_count, - const size_t request_element_cnt, TRITONBACKEND_Response** response, - cudaStream_t stream, const char* host_policy_name) -{ - bool cuda_copy = false; - - // For string data type, we always need to have the data on CPU so - // that we can read string length and construct the string - // properly. So if the request's input tensor is not in CPU need to - // copy it there. - const char* content = nullptr; - size_t content_byte_size = 0; - - std::vector contiguous_buffer; - auto err = GetContiguousInputContent( - input, buffer_count, &content, &content_byte_size, &contiguous_buffer, - stream, &cuda_copy); - if (err != nullptr) { - RESPOND_AND_SET_NULL_IF_ERROR(response, err); - FillStringTensor(input_list, request_element_cnt); - return cuda_copy; - } - -#ifdef TRITON_ENABLE_GPU - if (cuda_copy) { - cudaStreamSynchronize(stream); - cuda_copy = false; - } -#endif // TRITON_ENABLE_GPU - - std::vector> str_list; - err = ValidateStringBuffer( - content, content_byte_size, request_element_cnt, name, &str_list); - // Set string values. 
- for (const auto& [addr, len] : str_list) { - input_list->push_back(std::string(addr, len)); - } - - size_t element_cnt = str_list.size(); - if (err != nullptr) { - RESPOND_AND_SET_NULL_IF_ERROR(response, err); - FillStringTensor(input_list, request_element_cnt - element_cnt); - } - return cuda_copy; -} - -bool -SetStringBuffer( - torch::List* tensor, TRITONBACKEND_Response** response, - TRITONBACKEND_Output* response_output, TRITONBACKEND_State* response_state, - const size_t tensor_element_count, cudaStream_t stream, - std::string* serialized, bool state) -{ - bool cuda_copy = false; - - // Serialize the output tensor strings. Each string is serialized as - // a 4-byte length followed by the string itself with no - // null-terminator. - serialized->clear(); - for (size_t e = 0; e < tensor_element_count; ++e) { - std::string str = tensor->get(e).to(); - const char* cstr = str.c_str(); - size_t len = str.length(); - serialized->append(reinterpret_cast(&len), sizeof(uint32_t)); - if (len > 0) { - serialized->append(cstr, len); - } - } - - // Allocate a buffer large enough to hold the serialized tensor. - TRITONSERVER_MemoryType actual_memory_type = TRITONSERVER_MEMORY_CPU; - int64_t actual_memory_type_id = 0; - - TRITONSERVER_Error* err; - void* buffer; - - if (!state) { - auto err = TRITONBACKEND_OutputBuffer( - response_output, &buffer, serialized->size(), &actual_memory_type, - &actual_memory_type_id); - if (err != nullptr) { - RESPOND_AND_SET_NULL_IF_ERROR(response, err); - return cuda_copy; - } - } else { - auto err = TRITONBACKEND_StateBuffer( - response_state, &buffer, serialized->size(), &actual_memory_type, - &actual_memory_type_id); - if (err != nullptr) { - RESPOND_AND_SET_NULL_IF_ERROR(response, err); - return cuda_copy; - } - } - // Copy the serialized tensor into the allocated buffer. 
- bool cuda_used = false; - err = CopyBuffer( - "String output", TRITONSERVER_MEMORY_CPU /* src_memory_type */, - 0 /* src_memory_type_id */, actual_memory_type, actual_memory_type_id, - serialized->size(), reinterpret_cast(serialized->c_str()), - buffer, stream, &cuda_used); - cuda_copy |= cuda_used; - - if (err != nullptr) { - RESPOND_AND_SET_NULL_IF_ERROR(response, err); - return cuda_copy; - } - - if (state) { - RESPOND_AND_SET_NULL_IF_ERROR( - response, TRITONBACKEND_StateUpdate(response_state)); - } - - return cuda_copy; -} - - -bool -SetStringOutputBuffer( - torch::List* tensor, TRITONBACKEND_Response** response, - TRITONBACKEND_Output* response_output, const size_t tensor_element_count, - cudaStream_t stream, std::string* serialized) -{ - return SetStringBuffer( - tensor, response, response_output, nullptr /* response_state */, - tensor_element_count, stream, serialized, false /* state */); -} - -bool -SetStringStateBuffer( - torch::List* tensor, TRITONBACKEND_Response** response, - TRITONBACKEND_State* response_state, const size_t tensor_element_count, - cudaStream_t stream, std::string* serialized) -{ - return SetStringBuffer( - tensor, response, nullptr /* response_output */, response_state, - tensor_element_count, stream, serialized, true /* state */); -} - - -TRITONSERVER_Error* -ModelInstanceState::SetInputTensors( - size_t total_batch_size, TRITONBACKEND_Request** requests, - const uint32_t request_count, - std::vector* responses, - BackendInputCollector* collector, std::vector* input_names, - std::vector* input_tensors, bool* cuda_copy) -{ - // InferenceMode should be used to guard all tensors operations - torch::InferenceMode infer_guard(model_state_->EnabledInferenceMode()); - - // All requests must have equally-sized input tensors so use any - // request as the representative for the input tensors. - uint32_t input_count; - RETURN_IF_ERROR(TRITONBACKEND_RequestInputCount(requests[0], &input_count)); - - input_tensors->resize(input_count + batch_input_count_); - - // The inputs must be in contiguous CPU/GPU memory. - std::vector> alloc_perference; - if (device_.is_cpu()) { - alloc_perference = { - {TRITONSERVER_MEMORY_CPU_PINNED, 0}, {TRITONSERVER_MEMORY_CPU, 0}}; - } else { - alloc_perference = {{TRITONSERVER_MEMORY_GPU, device_.index()}}; - } - - for (uint32_t input_idx = 0; input_idx < input_count; input_idx++) { - TRITONBACKEND_Input* input; - RETURN_IF_ERROR( - TRITONBACKEND_RequestInputByIndex(requests[0], input_idx, &input)); - - const char* input_name; - TRITONSERVER_DataType input_datatype; - const int64_t* input_shape; - uint32_t input_dims_count; - RETURN_IF_ERROR(TRITONBACKEND_InputProperties( - input, &input_name, &input_datatype, &input_shape, &input_dims_count, - nullptr, nullptr)); - - input_names->emplace_back(input_name); - - // The shape for the entire input patch, - // [total_batch_size, ...] 
for non-ragged input and - // [total_element_count] for ragged input (non-nested tensor) - std::vector batchn_shape; - if (StateForModel()->IsInputRagged(input_name)) { - batchn_shape = std::vector{0}; - for (size_t idx = 0; idx < request_count; idx++) { - TRITONBACKEND_Input* input; - RESPOND_AND_SET_NULL_IF_ERROR( - &((*responses)[idx]), - TRITONBACKEND_RequestInput(requests[idx], input_name, &input)); - const int64_t* input_shape; - uint32_t input_dims_count; - RESPOND_AND_SET_NULL_IF_ERROR( - &((*responses)[idx]), TRITONBACKEND_InputProperties( - input, nullptr, nullptr, &input_shape, - &input_dims_count, nullptr, nullptr)); - - int64_t element_cnt = 0; - RESPOND_AND_SET_NULL_IF_ERROR( - &((*responses)[idx]), - GetElementCount(input_shape, input_dims_count, &element_cnt)); - batchn_shape[0] += element_cnt; - } - } else { - batchn_shape = - std::vector(input_shape, input_shape + input_dims_count); - if (supports_batching_) { - batchn_shape[0] = total_batch_size; - } - } - - // The input must be in contiguous CPU/GPU memory. - std::vector> alloc_perference; - // For 'KIND_MODEL', input will always be in CPU as we don't have a way to - // query the input types. - if (device_.is_cpu() || (Kind() == TRITONSERVER_INSTANCEGROUPKIND_MODEL)) { - alloc_perference = { - {TRITONSERVER_MEMORY_CPU_PINNED, 0}, {TRITONSERVER_MEMORY_CPU, 0}}; - } else { - alloc_perference = {{TRITONSERVER_MEMORY_GPU, device_.index()}}; - } - - const char* input_buffer; - size_t batchn_byte_size; - TRITONSERVER_MemoryType memory_type; - int64_t memory_type_id; - RETURN_IF_ERROR(collector->ProcessTensor( - input_name, nullptr, 0, alloc_perference, &input_buffer, - &batchn_byte_size, &memory_type, &memory_type_id)); - - // Create Torch tensor - const auto torch_dtype = ConvertDataTypeToTorchType(input_datatype); - torch::TensorOptions options{torch_dtype.second}; - auto updated_options = (memory_type == TRITONSERVER_MEMORY_GPU) - ? options.device(torch::kCUDA, device_.index()) - : options.device(torch::kCPU); - - if (input_datatype == TRITONSERVER_TYPE_BYTES) { - // Create the PyTorch list to hold the strings. 
- torch::List input_list; - input_list.reserve(batchn_shape[0]); - - for (size_t idx = 0; idx < request_count; idx++) { - TRITONBACKEND_Input* input; - RESPOND_AND_SET_NULL_IF_ERROR( - &((*responses)[idx]), - TRITONBACKEND_RequestInput(requests[idx], input_name, &input)); - const int64_t* shape; - uint32_t dims_count; - uint32_t buffer_count; - RESPOND_AND_SET_NULL_IF_ERROR( - &((*responses)[idx]), - TRITONBACKEND_InputPropertiesForHostPolicy( - input, HostPolicyName().c_str(), nullptr, nullptr, &shape, - &dims_count, nullptr, &buffer_count)); - - int64_t batch_element_cnt = 0; - RESPOND_AND_SET_NULL_IF_ERROR( - &((*responses)[idx]), - GetElementCount(shape, dims_count, &batch_element_cnt)); - - *cuda_copy |= SetStringInputTensor( - &input_list, input, input_name, buffer_count, batch_element_cnt, - &((*responses)[idx]), GetCudaStreamByInstanceKind(), - HostPolicyName().c_str()); - } - - (*input_tensors)[input_index_map_[input_name]] = input_list; - } else { - if (batchn_byte_size) { - // Remove constness to align with the signature of torch::from_blob() - torch::Tensor input_tensor = torch::from_blob( - const_cast(input_buffer), batchn_shape, updated_options); - (*input_tensors)[input_index_map_[input_name]] = input_tensor; - } else { - // torch:from_blob seems not working when the input size is 0 - // create zero-length inputs directly - torch::Tensor input_tensor = - torch::zeros(batchn_shape, updated_options); - (*input_tensors)[input_index_map_[input_name]] = input_tensor; - } - } - } - - for (const auto& batch_input : StateForModel()->BatchInputs()) { - std::vector shape; - collector->BatchInputShape(batch_input, &shape); - - for (const auto& input_name : batch_input.TargetNames()) { - input_names->emplace_back(input_name.c_str()); - - const char* dst_buffer; - size_t dst_buffer_byte_size; - TRITONSERVER_MemoryType dst_memory_type; - int64_t dst_memory_type_id; - - RESPOND_ALL_AND_SET_NULL_IF_ERROR( - (*responses), responses->size(), - collector->ProcessBatchInput( - batch_input, nullptr, 0, alloc_perference, &dst_buffer, - &dst_buffer_byte_size, &dst_memory_type, &dst_memory_type_id)); - - const auto torch_dtype = - ConvertDataTypeToTorchType(batch_input.DataType()); - torch::TensorOptions options{torch_dtype.second}; - auto updated_options = (dst_memory_type == TRITONSERVER_MEMORY_GPU) - ? options.device(torch::kCUDA, device_.index()) - : options.device(torch::kCPU); - - if (dst_buffer_byte_size) { - torch::Tensor input_tensor = torch::from_blob( - const_cast(dst_buffer), shape, updated_options); - (*input_tensors)[input_index_map_[input_name]] = input_tensor; - } else { - // special handle when input has zero size - torch::Tensor input_tensor = torch::zeros(shape, updated_options); - (*input_tensors)[input_index_map_[input_name]] = input_tensor; - } - } - } - - // Finalize... 
- *cuda_copy |= collector->Finalize(); - - return nullptr; -} - -TRITONSERVER_Error* -ModelInstanceState::ReadOutputTensors( - size_t total_batch_size, - const std::vector& output_tensors, - TRITONBACKEND_Request** requests, const uint32_t request_count, - std::vector* responses) -{ - NVTX_RANGE(nvtx_, "ReadOutputTensors " + Name()); - - BackendOutputResponder responder( - requests, request_count, responses, model_state_->TritonMemoryManager(), - model_state_->MaxBatchSize() > 0, model_state_->EnablePinnedInput(), - GetCudaStreamByInstanceKind()); - - bool cuda_copy = false; - // The serialized string buffer must be valid until output copies are done - std::vector> string_buffer; - for (auto& output : model_state_->ModelOutputs()) { - int op_index = output_index_map_[output.first]; - auto name = output.first; - auto output_tensor_pair = output.second; - - if (output_tensors[op_index].isTensor()) { - torch::Tensor output_flat; - try { - output_flat = - output_tensors[op_index].toTensor().contiguous().flatten(); - } - catch (std::exception& ex) { - RETURN_IF_ERROR(TRITONSERVER_ErrorNew( - TRITONSERVER_ERROR_INTERNAL, - (std::string("output tensor '") + name + "' is not found") - .c_str())); - } - - // Verify output datatype matches datatype from model config - TRITONSERVER_DataType output_dtype = - ConvertTorchTypeToDataType(output_flat.scalar_type()); - TRITONSERVER_DataType config_datatype = output_dtype_map_[name]; - if (config_datatype != output_dtype) { - RETURN_IF_ERROR(TRITONSERVER_ErrorNew( - TRITONSERVER_ERROR_INVALID_ARG, - (std::string("configuration expects datatype TYPE_") + - TRITONSERVER_DataTypeString(config_datatype) + " for output '" + - name + "', model provides TYPE_" + - TRITONSERVER_DataTypeString(output_dtype)) - .c_str())); - } - - const char* output_buffer = - static_cast(output_flat.data_ptr()); - - // Output tensors may not reside on the same device as model - torch::Device tensor_device = output_flat.device(); - const auto memory_type = (tensor_device.type() == torch::kCPU) - ? TRITONSERVER_MEMORY_CPU - : TRITONSERVER_MEMORY_GPU; - const auto memory_id = - (tensor_device.type() == torch::kCPU) ? 0 : tensor_device.index(); - - // Batch output doesn't support string data type yet, as it is not trivial - // to parse string output - const BatchOutput* batch_output = StateForModel()->FindBatchOutput(name); - if (batch_output == nullptr) { - // Get output shape - std::vector batchn_shape; - auto shape = output_tensors[op_index].toTensor().sizes(); - for (auto itr = shape.begin(); itr != shape.end(); itr++) { - batchn_shape.push_back(*itr); - } - - if (batchn_shape.size() == 0) { - return TRITONSERVER_ErrorNew( - TRITONSERVER_ERROR_INVALID_ARG, - (std::string("output '") + name + - "' is a scalar which is not supported.") - .c_str()); - } - if (output_tensor_pair.first != -1) { - responder.ProcessTensor( - name, output_dtype, batchn_shape, output_buffer, memory_type, - memory_id); - } - if (output_tensor_pair.second != -1) { - std::vector states; - states = responder.ProcessStateTensor( - name, output_dtype, batchn_shape, output_buffer, memory_type, - memory_id); - // Update the states - for (auto& state : states) { - RETURN_IF_ERROR(TRITONBACKEND_StateUpdate(state)); - } - } - - } else { - responder.ProcessBatchOutput( - name, *batch_output, output_buffer, memory_type, memory_id); - } - } else if (output_tensors[op_index].isList()) { - // Custom handling for string/bytes tensor... 
- torch::List output_list = - output_tensors[op_index].toList(); - - // Get output shape - std::vector batchn_shape{(int64_t)output_list.size()}; - - for (size_t idx = 0; idx < responses->size(); idx++) { - auto& request = requests[idx]; - auto& response = (*responses)[idx]; - - if (supports_batching_ != 0) { - TRITONBACKEND_Input* input; - TRITONBACKEND_RequestInputByIndex(request, 0 /* index*/, &input); - const int64_t* shape; - TRITONBACKEND_InputProperties( - input, nullptr, nullptr, &shape, nullptr, nullptr, nullptr); - batchn_shape[0] = shape[0]; - } - - int64_t tensor_element_cnt = 0; - RETURN_IF_ERROR(GetElementCount(batchn_shape, &tensor_element_cnt)); - - // Only need an response tensor for requested outputs. - if (response != nullptr) { - if (output_tensor_pair.first != -1) { - TRITONBACKEND_Output* response_output; - RESPOND_AND_SET_NULL_IF_ERROR( - &response, TRITONBACKEND_ResponseOutput( - response, &response_output, name.c_str(), - TRITONSERVER_TYPE_BYTES, batchn_shape.data(), - batchn_shape.size())); - string_buffer.emplace_back(new std::string()); - cuda_copy |= SetStringOutputBuffer( - &output_list, &response, response_output, tensor_element_cnt, - GetCudaStreamByInstanceKind(), string_buffer.back().get()); - } - } - if (output_tensor_pair.second != -1) { - TRITONBACKEND_State* response_state; - RESPOND_AND_SET_NULL_IF_ERROR( - &response, TRITONBACKEND_StateNew( - &response_state, request, name.c_str(), - TRITONSERVER_TYPE_BYTES, batchn_shape.data(), - batchn_shape.size())); - - string_buffer.emplace_back(new std::string()); - cuda_copy |= SetStringStateBuffer( - &output_list, &response, response_state, tensor_element_cnt, - GetCudaStreamByInstanceKind(), string_buffer.back().get()); - } - } - } else { - return TRITONSERVER_ErrorNew( - TRITONSERVER_ERROR_INVALID_ARG, - (std::string("output '") + name + - "' must be of type Tensor or List[str].") - .c_str()); - } - } - - // Finalize and wait for any pending buffer copies. - cuda_copy |= responder.Finalize(); - -#ifdef TRITON_ENABLE_GPU - // We have to always synchronize the stream. This is to make sure that - // the events on the cuda stream are synchronized. Otherwise, the events - // are only guaranteed to be synchronized if the model provides the output - // on GPU. - cudaStreamSynchronize(GetCudaStreamByInstanceKind()); -#endif - - return nullptr; -} - -TRITONSERVER_Error* -ModelInstanceState::RecordBackendTimestamp( - uint64_t* timestamp, void* cuda_event) -{ - if ((Kind() == TRITONSERVER_INSTANCEGROUPKIND_GPU) || - ((Kind() == TRITONSERVER_INSTANCEGROUPKIND_MODEL) && (device_cnt_ > 0))) { -#ifdef TRITON_ENABLE_GPU - cudaEvent_t* lcuda_event = reinterpret_cast(cuda_event); - RETURN_IF_ERROR(ConvertCUDAStatusToTritonError( - cudaEventRecord(*lcuda_event, GetCudaStreamByInstanceKind()), - TRITONSERVER_ERROR_INTERNAL, "Failed to record the event.")); -#endif - } else { - SET_TIMESTAMP(*timestamp); - } - return nullptr; -} - -void -ModelInstanceState::CreateCudaEvents(const int32_t& device_id) -{ -#ifdef TRITON_ENABLE_GPU - // Need to set the CUDA context so that the context that events are - // created on match with contexts that events are recorded with. 
- THROW_IF_BACKEND_INSTANCE_ERROR(ConvertCUDAStatusToTritonError( - cudaSetDevice(device_id), TRITONSERVER_ERROR_INTERNAL, - "Failed to set the device")); - THROW_IF_BACKEND_INSTANCE_ERROR(ConvertCUDAStatusToTritonError( - cudaEventCreate(&compute_input_start_event_), TRITONSERVER_ERROR_INTERNAL, - "Failed to create cuda event")); - THROW_IF_BACKEND_INSTANCE_ERROR(ConvertCUDAStatusToTritonError( - cudaEventCreate(&compute_infer_start_event_), TRITONSERVER_ERROR_INTERNAL, - "Failed to create cuda event")); - THROW_IF_BACKEND_INSTANCE_ERROR(ConvertCUDAStatusToTritonError( - cudaEventCreate(&compute_output_start_event_), - TRITONSERVER_ERROR_INTERNAL, "Failed to create cuda event")); -#endif -} - -cudaStream_t -ModelInstanceState::GetCudaStreamByInstanceKind() -{ -#ifdef TRITON_ENABLE_GPU - if (Kind() == TRITONSERVER_INSTANCEGROUPKIND_GPU) { - return stream_; - } else if ( - (Kind() == TRITONSERVER_INSTANCEGROUPKIND_MODEL) && - !stream_vec_.empty()) { - return stream_vec_[0]; - } -#endif - return nullptr; -} - -void -ModelInstanceState::SetCurrentCudaStream( - const cudaStream_t& stream, const int& device_id) -{ -#ifdef TRITON_ENABLE_GPU - at::cuda::CUDAStream torch_stream = - at::cuda::getStreamFromExternal(stream, device_id); - // This function replaces the default stream with the stream we created. It - // is not necessary to change the current device to the desired device when - // replacing the default stream for that device. See the documentation here: - // https://pytorch.org/cppdocs/api/function_namespacec10_1_1cuda_1a6ed50cc0fc16cc7014d9c2f4c3bd098d.html - at::cuda::setCurrentCUDAStream(torch_stream); -#endif -} - -float -ModelInstanceState::GetCudaEventElapsedTime( - const cudaEvent_t& start_event, const cudaEvent_t& end_event) -{ - float duration = 0; -#ifdef TRITON_ENABLE_GPU - // [FIXME] in the case of cudaEventElapsedTime failure, should handle - // stats reporting more gracefully as the durations are inaccurate - LOG_IF_ERROR( - ConvertCUDAStatusToTritonError( - cudaEventElapsedTime(&duration, start_event, end_event), - TRITONSERVER_ERROR_INTERNAL, "Failed to capture elapsed time"), - "Failed to capture elapsed time"); -#endif - return duration; -} - -///////////// +namespace triton::backend::pytorch { extern "C" { @@ -2704,8 +219,8 @@ TRITONBACKEND_ModelInstanceExecute( } return nullptr; // success -} +}; } // extern "C" -}}} // namespace triton::backend::pytorch +} // namespace triton::backend::pytorch diff --git a/src/libtorch.hh b/src/libtorch.hh new file mode 100644 index 0000000..263c340 --- /dev/null +++ b/src/libtorch.hh @@ -0,0 +1,59 @@ +// Copyright 2019-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions +// are met: +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// * Neither the name of NVIDIA CORPORATION nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. 
+// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +#include "model_instance_state.hh" +#include "model_state.hh" +#include "naming_convention.hh" +#include "string_utilities.hh" + +// +// PyTorch C++ (LibTorch) Backend that implements the TRITONBACKEND API. +// + +namespace triton::backend::pytorch { + +extern "C" { + +TRITONSERVER_Error* TRITONBACKEND_Initialize(TRITONBACKEND_Backend* backend); + +TRITONSERVER_Error* TRITONBACKEND_ModelInitialize(TRITONBACKEND_Model* model); + +TRITONSERVER_Error* TRITONBACKEND_ModelFinalize(TRITONBACKEND_Model* model); + +TRITONSERVER_Error* TRITONBACKEND_ModelInstanceInitialize( + TRITONBACKEND_ModelInstance* instance); + +TRITONSERVER_Error* TRITONBACKEND_ModelInstanceFinalize( + TRITONBACKEND_ModelInstance* instance); + +TRITONSERVER_Error* TRITONBACKEND_ModelInstanceExecute( + TRITONBACKEND_ModelInstance* instance, TRITONBACKEND_Request** requests, + const uint32_t request_count); + +} // extern "C" + + +} // namespace triton::backend::pytorch diff --git a/src/model_instance_state.cc b/src/model_instance_state.cc new file mode 100644 index 0000000..7cd5ee3 --- /dev/null +++ b/src/model_instance_state.cc @@ -0,0 +1,1632 @@ +// Copyright 2019-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions +// are met: +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// * Neither the name of NVIDIA CORPORATION nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +#include "model_instance_state.hh" +#include "string_utilities.hh" + +#ifdef TRITON_PYTORCH_ENABLE_TORCHVISION +// Suppress warnings in torch headers +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wsign-compare" +#pragma warning(push, 0) +#include +#include // Torchvision header +#pragma warning(pop) +#pragma GCC diagnostic pop +#endif // TRITON_PYTORCH_ENABLE_TORCHVISION + +#ifdef TRITON_ENABLE_GPU +#include +#include +#include +#endif // TRITON_ENABLE_GPU + + +namespace triton::backend::pytorch { + +ModelInstanceState::ModelInstanceState( + ModelState* model_state, TRITONBACKEND_ModelInstance* triton_model_instance) + : BackendModelInstance(model_state, triton_model_instance), + model_state_(model_state), device_(torch::kCPU), is_dict_input_(false), + device_cnt_(0) +{ + if (Kind() == TRITONSERVER_INSTANCEGROUPKIND_GPU) { +#ifdef TRITON_ENABLE_GPU + device_ = torch::Device(torch::kCUDA, DeviceId()); + CreateCudaEvents(DeviceId()); +#endif + } + +#ifdef TRITON_ENABLE_GPU + device_cnt_ = torch::cuda::device_count(); +#endif + + THROW_IF_BACKEND_INSTANCE_ERROR(model_state->LoadModel( + ArtifactFilename(), device_, &model_path_, Kind(), &torch_model_)); + + if (Kind() == TRITONSERVER_INSTANCEGROUPKIND_MODEL) { +#ifdef TRITON_ENABLE_GPU + // Since we cannot determine the exact devices used by the model, we create + // a CUDA stream for every available device to ensure proper synchronization + // of CUDA streams. This approach may have implications when a timestamp is + // captured on a device that is not used by the model. Currently, this issue + // is addressed by synchronizing the CUDA streams before recording + // timestamps to prevent timestamp skewing. However, in the future, any + // modifications to the CUDA stream synchronization logic should be handled + // with caution. + for (int i = 0; i < device_cnt_; i++) { + cudaStream_t stream; + THROW_IF_BACKEND_INSTANCE_ERROR( + CreateCudaStream(i, 0 /* cuda_stream_priority */, &stream)); + stream_vec_.push_back(stream); + } + if (!stream_vec_.empty()) { + // Create CUDA events on the first device that will be used for collecting + // inputs/outputs. + CreateCudaEvents(0); + } +#endif + } + + size_t expected_input_cnt = 0; + { + triton::common::TritonJson::Value inputs; + if (model_state->ModelConfig().Find("input", &inputs)) { + expected_input_cnt = inputs.ArraySize(); + } + + triton::common::TritonJson::Value config_batch_inputs; + if (model_state->ModelConfig().Find("batch_input", &config_batch_inputs)) { + batch_input_count_ = config_batch_inputs.ArraySize(); + expected_input_cnt += batch_input_count_; + } + } + + // If this is a sequence model then make sure that the required + // inputs are present in the model and have the correct shape and + // datatype. 
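The constructor above creates one non-default CUDA stream per visible device when the instance group kind is `KIND_MODEL`, so that work on any device the model may touch can later be synchronized. The standalone sketch below is not part of the patch; it assumes only the CUDA runtime and shows the same per-device pattern that the backend's `CreateCudaStream` helper encapsulates.

```
// Standalone sketch: one non-default CUDA stream per visible device.
// Assumes a CUDA toolkit; build with `nvcc streams.cc`.
#include <cuda_runtime_api.h>

#include <cstdio>
#include <vector>

int main()
{
  int device_count = 0;
  if (cudaGetDeviceCount(&device_count) != cudaSuccess || device_count == 0) {
    std::fprintf(stderr, "no CUDA devices visible\n");
    return 1;
  }

  std::vector<cudaStream_t> streams(device_count, nullptr);
  for (int i = 0; i < device_count; ++i) {
    // Streams are created on the current device, so select it first.
    cudaSetDevice(i);
    cudaStreamCreateWithFlags(&streams[i], cudaStreamNonBlocking);
  }
  std::printf("created %d per-device stream(s)\n", device_count);

  for (int i = 0; i < device_count; ++i) {
    cudaSetDevice(i);
    cudaStreamDestroy(streams[i]);
  }
  return 0;
}
```

Keeping one stream per device, rather than the default stream, is what lets the backend later synchronize only its own work before taking timestamps.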
+ triton::common::TritonJson::Value sequence_batching; + if (model_state->ModelConfig().Find( + "sequence_batching", &sequence_batching)) { + bool have_start, have_end, have_ready, have_corrid; + THROW_IF_BACKEND_INSTANCE_ERROR(ValidateBooleanSequenceControl( + sequence_batching, "CONTROL_SEQUENCE_START", false /* required */, + &have_start)); + THROW_IF_BACKEND_INSTANCE_ERROR(ValidateBooleanSequenceControl( + sequence_batching, "CONTROL_SEQUENCE_END", false /* required */, + &have_end)); + THROW_IF_BACKEND_INSTANCE_ERROR(ValidateBooleanSequenceControl( + sequence_batching, "CONTROL_SEQUENCE_READY", false /* required */, + &have_ready)); + THROW_IF_BACKEND_INSTANCE_ERROR(ValidateTypedSequenceControl( + sequence_batching, "CONTROL_SEQUENCE_CORRID", false /* required */, + &have_corrid)); + if (have_start) { + expected_input_cnt += 1; + } + if (have_end) { + expected_input_cnt += 1; + } + if (have_ready) { + expected_input_cnt += 1; + } + if (have_corrid) { + expected_input_cnt += 1; + } + // Add the state inputs to the expected count + triton::common::TritonJson::Value states; + if (sequence_batching.Find("state", &states)) { + expected_input_cnt += states.ArraySize(); + } + } + supports_batching_ = model_state_->MaxBatchSize() > 0; + + THROW_IF_BACKEND_INSTANCE_ERROR(ValidateInputs(expected_input_cnt)); + THROW_IF_BACKEND_INSTANCE_ERROR(ValidateOutputs()); +} + +ModelInstanceState::~ModelInstanceState() +{ + torch_model_.reset(); + ClearCache(); + + if (Kind() == TRITONSERVER_INSTANCEGROUPKIND_MODEL) { +#ifdef TRITON_ENABLE_GPU + for (size_t i = 0; i < stream_vec_.size(); i++) { + LOG_IF_ERROR( + ConvertCUDAStatusToTritonError( + cudaSetDevice(i), TRITONSERVER_ERROR_INTERNAL, + "Failed to set the device"), + "Failed to set the device"); + + LOG_IF_ERROR( + ConvertCUDAStatusToTritonError( + cudaStreamDestroy(stream_vec_[i]), TRITONSERVER_ERROR_INTERNAL, + "Failed to destroy cuda stream"), + "~ModelInstanceState error: "); + stream_vec_[i] = nullptr; + } +#endif + } +} + +void +ModelInstanceState::AddInputToMap( + NamingConvention naming_convention, + const std::vector allowed_inputs, const std::string& io_name, + const uint32_t index) +{ + std::string deliminator = "__"; + + if (is_dict_input_) { + // If dictionary, index is irrelevant but we use the map to store the + // input names since they are the keys for the dictionary + input_index_map_[io_name] = index; + } else { + switch (naming_convention) { + case NamingConvention::FORWARD_ARGUMENT: { + auto itr = + std::find(allowed_inputs.begin(), allowed_inputs.end(), io_name); + if (itr != allowed_inputs.end()) { + input_index_map_[io_name] = + std::distance(allowed_inputs.begin(), itr); + } + return; + } + case NamingConvention::NAMED_INDEX: { + int start_pos = io_name.find(deliminator); + int ip_index = std::atoi(io_name.substr(start_pos + 2).c_str()); + input_index_map_[io_name] = ip_index; + return; + } + case NamingConvention::STRICT_CONFIG_ORDERING: { + input_index_map_[io_name] = index; + return; + } + } + } +} + +void +ModelInstanceState::ClearCache() +{ +#ifdef TRITON_ENABLE_GPU + if (device_.is_cuda() || + ((Kind() == TRITONSERVER_INSTANCEGROUPKIND_MODEL) && (device_cnt_ > 0))) { + c10::cuda::CUDACachingAllocator::emptyCache(); + } +#endif // TRITON_ENABLE_GPU +} + +TRITONSERVER_Error* +ModelInstanceState::Create( + ModelState* model_state, TRITONBACKEND_ModelInstance* triton_model_instance, + ModelInstanceState** state) +{ + try { + *state = new ModelInstanceState(model_state, triton_model_instance); + } + catch (const 
BackendModelInstanceException& ex) { + RETURN_ERROR_IF_TRUE( + ex.err_ == nullptr, TRITONSERVER_ERROR_INTERNAL, + std::string("unexpected nullptr in BackendModelInstanceException")); + RETURN_IF_ERROR(ex.err_); + } + + return nullptr; // success +} + +void +ModelInstanceState::Execute( + std::vector* responses, + const uint32_t response_count, + std::vector* input_tensors, + std::vector* output_tensors) +{ + NVTX_RANGE(nvtx_, "Execute " + Name()); + + torch::jit::IValue model_outputs_; + + try { + // enable/disable optimized execution + torch::jit::setGraphExecutorOptimize( + model_state_->EnabledOptimizedExecution()); + + // enable/disable inference mode - supersedes NoGradGuard + torch::InferenceMode infer_guard(model_state_->EnabledInferenceMode()); + + // enable/disable cudnn + at::globalContext().setUserEnabledCuDNN(model_state_->EnabledCudnn()); + + // JIT. No change is made unless parameter is explicitly set. + if (std::get<0>(model_state_->EnabledJitProfiling())) { + torch::jit::getProfilingMode() = + std::get<1>(model_state_->EnabledJitProfiling()); + } + + if (std::get<0>(model_state_->EnabledJitExecutor())) { + torch::jit::getExecutorMode() = + std::get<1>(model_state_->EnabledJitExecutor()); + } + + // Fuser. No change is made unless fuser is explicitly set in + // parameters. + if (std::get<0>(model_state_->EnabledTensorExprFuser())) { + torch::jit::setTensorExprFuserEnabled( + std::get<1>(model_state_->EnabledTensorExprFuser())); + } + + torch::NoGradGuard no_grad; + + // If input is a dictionary, prepare dictionary from 'input_tensors'. + if (is_dict_input_) { + torch::Dict input_dict; + for (auto& input_index : input_index_map_) { + torch::jit::IValue ival = (*input_tensors)[input_index.second]; + input_dict.insert(input_index.first, ival.toTensor()); + } + std::vector input_dict_ivalue = {input_dict}; + model_outputs_ = torch_model_->forward(input_dict_ivalue); + } else { + model_outputs_ = torch_model_->forward(*input_tensors); + } + + if (model_outputs_.isTuple()) { + auto model_outputs_tuple = model_outputs_.toTuple(); + size_t op_index = 0; + for (auto& m_op : model_outputs_tuple->elements()) { + if (m_op.isList()) { + auto list_output = m_op.toList(); + if (list_output.elementType()->kind() != c10::TypeKind::StringType) { + throw std::invalid_argument( + "output at index " + std::to_string(op_index) + + " must be of type Tensor or List[str], received List[" + + list_output.elementType()->str() + "]"); + } + output_tensors->push_back(m_op); + } else { + auto tensor_output = m_op.toTensor(); + output_tensors->push_back(m_op); + } + op_index++; + } + } else if (model_outputs_.isTensor()) { + output_tensors->push_back(model_outputs_); + } else if (model_outputs_.isList()) { + auto list_output = model_outputs_.toList(); + if (list_output.elementType()->kind() != c10::TypeKind::StringType) { + throw std::invalid_argument( + "output must be of type Tensor or List[str], received List[" + + list_output.elementType()->str() + "]"); + } + output_tensors->push_back(model_outputs_); + } else { + throw std::invalid_argument( + "output must be of type Tensor, List[str] or Tuple containing one of " + "these two types. 
It should not be a List / Dictionary of Tensors or " + "a Scalar"); + } + } + catch (std::exception& ex) { + SendErrorForResponses( + responses, response_count, + TRITONSERVER_ErrorNew( + TRITONSERVER_ERROR_INTERNAL, + ("PyTorch execute failure: " + std::string(ex.what())).c_str())); + } +} + +float +ModelInstanceState::GetCudaEventElapsedTime( + const cudaEvent_t& start_event, const cudaEvent_t& end_event) +{ + float duration = 0; +#ifdef TRITON_ENABLE_GPU + // [FIXME] in the case of cudaEventElapsedTime failure, should handle + // stats reporting more gracefully as the durations are inaccurate + LOG_IF_ERROR( + ConvertCUDAStatusToTritonError( + cudaEventElapsedTime(&duration, start_event, end_event), + TRITONSERVER_ERROR_INTERNAL, "Failed to capture elapsed time"), + "Failed to capture elapsed time"); +#endif + return duration; +} + + +cudaStream_t +ModelInstanceState::GetCudaStreamByInstanceKind() +{ +#ifdef TRITON_ENABLE_GPU + if (Kind() == TRITONSERVER_INSTANCEGROUPKIND_GPU) { + return stream_; + } else if ( + (Kind() == TRITONSERVER_INSTANCEGROUPKIND_MODEL) && + !stream_vec_.empty()) { + return stream_vec_[0]; + } +#endif + return nullptr; +} + +TRITONSERVER_Error* +ModelInstanceState::GetNamingConvention( + NamingConvention* naming_convention, + const std::vector& allowed_ios) +{ + // Rules for (non-Dictionary) input tensor names: + // 1. Must be in 'allowed_inputs' (arguments in the forward function) + // 2. Must follow the naming convention i.e. __ + // 3. If neither of the above conditions are satisfied, enforce strict + // ordering of model inputs. + // + // Rules for output tensor names: + // 1. Must follow the naming convention i.e. __ + // 2. If not, we enforce strict ordering of model outputs. + std::string deliminator = "__"; + std::string io_kind = "input"; + *naming_convention = NamingConvention::FORWARD_ARGUMENT; + + // symbolizes output + if (allowed_ios.size() == 0) { + io_kind = "output"; + *naming_convention = NamingConvention::NAMED_INDEX; + } + + triton::common::TritonJson::Value ios; + RETURN_IF_ERROR( + model_state_->ModelConfig().MemberAsArray(io_kind.c_str(), &ios)); + + if (io_kind == "input") { + for (size_t i = 0; i < ios.ArraySize(); i++) { + triton::common::TritonJson::Value io; + RETURN_IF_ERROR(ios.IndexAsObject(i, &io)); + + // Validate name + std::string io_name; + RETURN_IF_ERROR(io.MemberAsString("name", &io_name)); + auto itr = std::find(allowed_ios.begin(), allowed_ios.end(), io_name); + if (itr == allowed_ios.end()) { + *naming_convention = NamingConvention::NAMED_INDEX; + break; + } + } + } + + // If not, check if inputs follow INDEX + if (*naming_convention == NamingConvention::NAMED_INDEX) { + for (size_t i = 0; i < ios.ArraySize(); i++) { + triton::common::TritonJson::Value io; + RETURN_IF_ERROR(ios.IndexAsObject(i, &io)); + + // Validate name + std::string io_name; + RETURN_IF_ERROR(io.MemberAsString("name", &io_name)); + int start_pos = io_name.find(deliminator); + if (start_pos == -1) { + *naming_convention = NamingConvention::STRICT_CONFIG_ORDERING; + break; + } else { + // check if the index part of the name is not an integer + std::string index_str = io_name.substr(start_pos + 2); + bool is_int = true; + for (auto itr = index_str.begin(); itr != index_str.end(); itr++) { + if (std::isdigit(*itr) == 0) { + is_int = false; + } + } + + if (!is_int) { + if (io_kind == "input") { + LOG_MESSAGE( + TRITONSERVER_LOG_WARN, + ("input '" + io_name + + "' or previous input(s) are neither an input argument to the " + "model '" + + 
model_state_->Name() + + "' nor do they follow the __ naming convention. " + "Falling back to enforcing strict ordering from model " + "configuration.") + .c_str()); + } else { + LOG_MESSAGE( + TRITONSERVER_LOG_WARN, + ("output '" + io_name + + "' or previous output(s) of the model '" + + model_state_->Name() + + "' do not follow the __ naming convention. " + "Falling back to enforcing strict ordering from model " + "configuration.") + .c_str()); + } + *naming_convention = NamingConvention::STRICT_CONFIG_ORDERING; + break; + } + } + } + } + + triton::common::TritonJson::Value sequence_batching; + if (model_state_->ModelConfig().Find( + "sequence_batching", &sequence_batching)) { + // If we need to manage state for the model, then we need to check + // the naming of the state adheres to both the input and output conventions + triton::common::TritonJson::Value states; + if (sequence_batching.Find("state", &states)) { + if (*naming_convention != NamingConvention::NAMED_INDEX) { + return TRITONSERVER_ErrorNew( + TRITONSERVER_ERROR_INVALID_ARG, + ("PyTorch model '" + model_state_->Name() + + "' is using sequence batching with state but not all inputs and " + "outputs follow the __ naming convention. ") + .c_str()); + } + } + + for (size_t i = 0; i < states.ArraySize(); i++) { + triton::common::TritonJson::Value state; + RETURN_IF_ERROR(states.IndexAsObject(i, &state)); + std::string name_entry = + io_kind == "input" ? "input_name" : "output_name"; + std::string state_name; + RETURN_IF_ERROR(state.MemberAsString(name_entry.c_str(), &state_name)); + int start_pos = state_name.find(deliminator); + if (start_pos == -1) { + return TRITONSERVER_ErrorNew( + TRITONSERVER_ERROR_INVALID_ARG, + ("PyTorch model '" + model_state_->Name() + + "' is using sequence batching with state but state '" + + state_name + + "' does not follow the __ naming convention. ") + .c_str()); + } else { + // check if the index part of the name is not an integer + std::string index_str = state_name.substr(start_pos + 2); + bool is_int = true; + for (auto itr = index_str.begin(); itr != index_str.end(); itr++) { + if (std::isdigit(*itr) == 0) { + is_int = false; + } + } + if (!is_int) { + return TRITONSERVER_ErrorNew( + TRITONSERVER_ERROR_INVALID_ARG, + ("PyTorch model '" + model_state_->Name() + + "' is using sequence batching with state but state '" + + state_name + + "' does not follow the __ naming convention. ") + .c_str()); + } + } + } + } + + return nullptr; // success +} + +void +ModelInstanceState::ProcessRequests( + TRITONBACKEND_Request** requests, const uint32_t request_count) +{ + LOG_MESSAGE( + TRITONSERVER_LOG_VERBOSE, + (std::string("TRITONBACKEND_ModelExecute: Running ") + Name() + " with " + + std::to_string(request_count) + " requests") + .c_str()); + +#ifdef TRITON_ENABLE_GPU + if (Kind() == TRITONSERVER_INSTANCEGROUPKIND_GPU) { + SetCurrentCudaStream(stream_, DeviceId()); + } else if (Kind() == TRITONSERVER_INSTANCEGROUPKIND_MODEL) { + // Replace the default stream of each device with the one we created. + for (size_t i = 0; i < stream_vec_.size(); i++) { + SetCurrentCudaStream(stream_vec_[i], i); + } + } +#endif + + NVTX_RANGE(nvtx_, "ProcessRequests " + Name()); + + uint64_t exec_start_ns = 0; + SET_TIMESTAMP(exec_start_ns); + + const int max_batch_size = model_state_->MaxBatchSize(); + + // For each request collect the total batch size for this inference + // execution. The batch-size, number of inputs, and size of each + // input has already been checked so don't need to do that here. 
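The naming-convention logic above keys off a `__` delimiter: inputs and outputs named in the form `<name>__<index>` (for example `OUTPUT__0`) are mapped by the integer suffix, and anything else falls back to strict configuration ordering. A minimal standalone sketch of that parsing rule follows; `parse_io_index` is a hypothetical helper, not a backend function.

```
// Standalone sketch of <name>__<index> parsing: find the "__" delimiter and
// require an all-digit suffix, as GetNamingConvention/AddInputToMap do.
#include <cctype>
#include <iostream>
#include <optional>
#include <string>

std::optional<int>
parse_io_index(const std::string& io_name)
{
  const std::string delimiter = "__";
  const size_t pos = io_name.find(delimiter);
  if (pos == std::string::npos) {
    return std::nullopt;  // falls back to strict config ordering
  }
  const std::string suffix = io_name.substr(pos + delimiter.size());
  if (suffix.empty()) {
    return std::nullopt;
  }
  for (const char c : suffix) {
    if (!std::isdigit(static_cast<unsigned char>(c))) {
      return std::nullopt;  // e.g. "INPUT__a" does not qualify
    }
  }
  return std::stoi(suffix);
}

int main()
{
  for (const std::string name : {"INPUT__0", "OUTPUT__12", "logits"}) {
    const auto idx = parse_io_index(name);
    std::cout << name << " -> "
              << (idx ? std::to_string(*idx) : std::string("no index")) << "\n";
  }
  return 0;
}
```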
+ size_t total_batch_size = 0; + for (size_t i = 0; i < request_count; i++) { + // If we get a nullptr request then something is badly wrong. Fail + // and release all requests. + if (requests[i] == nullptr) { + RequestsRespondWithError( + requests, request_count, + TRITONSERVER_ErrorNew( + TRITONSERVER_ERROR_INTERNAL, + std::string( + "null request given to PyTorch backend for '" + Name() + "'") + .c_str())); + return; + } + } + + // At this point we are committed to running inference with all + // 'requests'. Create a response for each request. During input + // processing if there is an error with any request that error will + // be sent immediately with the corresponding response (and the + // response unique_ptr will then be nullptr). The request object + // itself will not be released until after all inferencing is done + // (below) as we may need to access the request object when + // determine how to process outputs (for example, even if we don't + // need the outputs for a request that has an error, we do need to + // know the size of those outputs associated with the request so we + // can skip them in the output tensors). + std::vector responses; + responses.reserve(request_count); + bool all_response_failed = false; + + for (size_t i = 0; i < request_count; i++) { + TRITONBACKEND_Response* response; + auto err = TRITONBACKEND_ResponseNew(&response, requests[i]); + if (err == nullptr) { + responses.emplace_back(response); + } else { + responses.emplace_back(nullptr); + LOG_MESSAGE(TRITONSERVER_LOG_ERROR, "Fail to create response"); + TRITONSERVER_ErrorDelete(err); + } + } + + for (size_t i = 0; i < request_count; i++) { + if (max_batch_size > 0) { + // Retrieve the batch size from one of the inputs, if the model + // supports batching, the first dimension size is batch size. + TRITONBACKEND_Input* input; + TRITONSERVER_Error* err = + TRITONBACKEND_RequestInputByIndex(requests[i], 0 /* index */, &input); + if (err == nullptr) { + const int64_t* shape; + err = TRITONBACKEND_InputProperties( + input, nullptr, nullptr, &shape, nullptr, nullptr, nullptr); + total_batch_size += shape[0]; + } + if (err != nullptr) { + RESPOND_ALL_AND_SET_TRUE_IF_ERROR( + responses, request_count, all_response_failed, err); + } + } else { + total_batch_size += 1; + } + } + + // If there are no valid payloads then no need to run the inference. + if (total_batch_size == 0) { + return; + } + + // Make sure the maximum batch size is not exceeded. The + // total_batch_size must be 1 for models that don't support batching + // (i.e. max_batch_size == 0). If max_batch_size is exceeded then + // scheduler has done something badly wrong so fail and release all + // requests. + if (!all_response_failed) { + if ((total_batch_size != 1) && + (total_batch_size > (size_t)max_batch_size)) { + RESPOND_ALL_AND_SET_TRUE_IF_ERROR( + responses, request_count, all_response_failed, + TRITONSERVER_ErrorNew( + TRITONSERVER_ERROR_INTERNAL, + std::string( + "batch size " + std::to_string(total_batch_size) + " for '" + + Name() + "', max allowed is " + + std::to_string(max_batch_size)) + .c_str())); + } + } + + std::vector input_names; + std::vector input_tensors; + bool cuda_copy = false; + std::unique_ptr collector; + + // For 'KIND_MODEL', it's fine to use CUDA events to calculate the compute + // input duration since only one stream will be used for input collection. 
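The compute-input and compute-infer durations above are measured with CUDA events recorded on the collection stream; `cudaEventElapsedTime` reports milliseconds, which is why the statistics code later multiplies by `1e6` to obtain nanoseconds. A reduced, standalone sketch of that timing pattern (CUDA runtime only, not part of the patch):

```
// Standalone sketch: time GPU work with CUDA events, then convert the
// millisecond result to nanoseconds as the backend's statistics code does.
#include <cuda_runtime_api.h>

#include <cstdint>
#include <cstdio>

int main()
{
  cudaStream_t stream;
  cudaEvent_t start, stop;
  cudaStreamCreate(&stream);
  cudaEventCreate(&start);
  cudaEventCreate(&stop);

  void* buf = nullptr;
  cudaMalloc(&buf, 64 << 20);

  cudaEventRecord(start, stream);
  cudaMemsetAsync(buf, 0, 64 << 20, stream);  // stand-in for real work
  cudaEventRecord(stop, stream);
  cudaEventSynchronize(stop);  // elapsed time is valid only after both events complete

  float ms = 0.0f;
  cudaEventElapsedTime(&ms, start, stop);
  const uint64_t ns = static_cast<uint64_t>(ms * 1e6);
  std::printf("memset took %.3f ms (%llu ns)\n", ms, (unsigned long long)ns);

  cudaFree(buf);
  cudaEventDestroy(start);
  cudaEventDestroy(stop);
  cudaStreamDestroy(stream);
  return 0;
}
```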
+ if ((Kind() == TRITONSERVER_INSTANCEGROUPKIND_GPU) || + ((Kind() == TRITONSERVER_INSTANCEGROUPKIND_MODEL) && (device_cnt_ > 0))) { +#ifdef TRITON_ENABLE_GPU + RESPOND_ALL_AND_SET_TRUE_IF_ERROR( + responses, request_count, all_response_failed, + ConvertCUDAStatusToTritonError( + cudaEventRecord( + compute_input_start_event_, GetCudaStreamByInstanceKind()), + TRITONSERVER_ERROR_INTERNAL, "Failed to record the event.")); +#endif + } + + if (!all_response_failed) { + collector.reset(new BackendInputCollector( + requests, request_count, &responses, + model_state_->TritonMemoryManager(), model_state_->EnablePinnedInput(), + GetCudaStreamByInstanceKind(), nullptr, nullptr, 0, + HostPolicyName().c_str())); + RESPOND_ALL_AND_SET_TRUE_IF_ERROR( + responses, request_count, all_response_failed, + SetInputTensors( + total_batch_size, requests, request_count, &responses, + collector.get(), &input_names, &input_tensors, &cuda_copy)); + } + +#ifdef TRITON_ENABLE_GPU + if (cuda_copy) { + cudaStreamSynchronize(GetCudaStreamByInstanceKind()); + cuda_copy = false; + } +#endif + + std::vector output_tensors; + uint64_t compute_start_ns = 0; + uint64_t compute_infer_start = 0; + + RESPOND_ALL_AND_SET_TRUE_IF_ERROR( + responses, request_count, all_response_failed, + RecordBackendTimestamp( + &compute_start_ns, + reinterpret_cast(&compute_infer_start_event_))); + + // For 'KIND_MODEL', capture the timestamp for the compute infer duration. + if ((Kind() == TRITONSERVER_INSTANCEGROUPKIND_MODEL) && (device_cnt_ > 0)) { + SET_TIMESTAMP(compute_infer_start); + } + + // Run... + if (!all_response_failed) { + Execute(&responses, request_count, &input_tensors, &output_tensors); + } + + // Verify output indices are valid with number of outputs after execution + bool invalid_index = false; + int max_index = output_tensors.size() - 1; + + if (!all_response_failed) { + for (const auto& name : model_state_->ModelOutputs()) { + int op_index = output_index_map_[name.first]; + if ((op_index < 0) || (op_index > max_index)) { + RESPOND_ALL_AND_SET_TRUE_IF_ERROR( + responses, request_count, all_response_failed, + TRITONSERVER_ErrorNew( + TRITONSERVER_ERROR_INVALID_ARG, + std::string( + "The output " + std::string(name.first) + + " in the model configuration refers to an output index " + "which doesn't exist. This model has " + + std::to_string(max_index + 1) + " outputs") + .c_str())); + invalid_index = true; + break; + } + } + } + +#ifdef TRITON_ENABLE_GPU + if (Kind() == TRITONSERVER_INSTANCEGROUPKIND_MODEL) { + // For 'KIND_MODEL', multiple streams will be involved, so we need to call + // 'cudaStreamSynchronize' before reading the output tensors. 
+ for (auto& stream : stream_vec_) { + cudaStreamSynchronize(stream); + } + } +#endif + + uint64_t compute_end_ns = 0; + uint64_t compute_output_start = 0; + + if ((Kind() == TRITONSERVER_INSTANCEGROUPKIND_MODEL) && (device_cnt_ > 0)) { +#ifdef TRITON_ENABLE_GPU + SET_TIMESTAMP(compute_output_start); +#endif + } else { + RESPOND_ALL_AND_SET_TRUE_IF_ERROR( + responses, request_count, all_response_failed, + RecordBackendTimestamp( + &compute_end_ns, + reinterpret_cast(&compute_output_start_event_))); + } + + if (!all_response_failed) { + if (!invalid_index) { + RESPOND_ALL_AND_SET_TRUE_IF_ERROR( + responses, request_count, all_response_failed, + ReadOutputTensors( + total_batch_size, output_tensors, requests, request_count, + &responses)); + } + } + + uint64_t exec_end_ns = 0; + SET_TIMESTAMP(exec_end_ns); + + // Send all the responses that haven't already been sent because of + // an earlier error. Note that the responses are not set to nullptr + // here as we need that indication below to determine if the request + // we successful or not. + for (auto& response : responses) { + if (response != nullptr) { + LOG_IF_ERROR( + TRITONBACKEND_ResponseSend( + response, TRITONSERVER_RESPONSE_COMPLETE_FINAL, nullptr), + "failed to send PyTorch backend response"); + } + } + + // We don't need an explicit CUDA syncrhonization here since we have already + // synchronized the stream in the ReadOutputTensors function. + if (Kind() == TRITONSERVER_INSTANCEGROUPKIND_GPU) { +#ifdef TRITON_ENABLE_GPU + float compute_input_duration = GetCudaEventElapsedTime( + compute_input_start_event_, compute_infer_start_event_); + float compute_infer_duration = GetCudaEventElapsedTime( + compute_infer_start_event_, compute_output_start_event_); + + compute_start_ns = exec_start_ns + (compute_input_duration * 1e6); + compute_end_ns = compute_start_ns + (compute_infer_duration * 1e6); +#endif + } else if ( + (Kind() == TRITONSERVER_INSTANCEGROUPKIND_MODEL) && (device_cnt_ > 0)) { +#ifdef TRITON_ENABLE_GPU + float compute_input_duration = GetCudaEventElapsedTime( + compute_input_start_event_, compute_infer_start_event_); + uint64_t compute_infer_duration = + compute_output_start - compute_infer_start; + + compute_start_ns = exec_start_ns + (compute_input_duration * 1e6); + compute_end_ns = compute_start_ns + compute_infer_duration; +#endif + } + + // Report statistics for each request. + for (uint32_t r = 0; r < request_count; ++r) { + auto& request = requests[r]; + LOG_IF_ERROR( + TRITONBACKEND_ModelInstanceReportStatistics( + TritonModelInstance(), request, + (responses[r] != nullptr) /* success */, exec_start_ns, + compute_start_ns, compute_end_ns, exec_end_ns), + "failed reporting request statistics"); + + LOG_IF_ERROR( + TRITONBACKEND_RequestRelease(request, TRITONSERVER_REQUEST_RELEASE_ALL), + "failed releasing request"); + } + + if (!all_response_failed) { + // Report the entire batch statistics. 
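The statistics above mix two clock sources: CUDA events for GPU-backed instances and host timestamps otherwise, which is the branching that `RecordBackendTimestamp` implements. The sketch below is a hypothetical stand-in (`record_timestamp` is not a backend function); the host path produces a steady-clock nanosecond timestamp similar to what the `SET_TIMESTAMP` macro yields.

```
// Standalone sketch of the two timestamping paths: record a CUDA event on the
// collection stream for GPU-backed instances, otherwise take a host
// steady-clock timestamp in nanoseconds.
#include <cuda_runtime_api.h>

#include <chrono>
#include <cstdint>
#include <cstdio>

void
record_timestamp(
    bool use_cuda_event, cudaStream_t stream, cudaEvent_t event,
    uint64_t* host_timestamp_ns)
{
  if (use_cuda_event) {
    // Completes once all prior work on `stream` has finished; durations are
    // read back later with cudaEventElapsedTime.
    cudaEventRecord(event, stream);
  } else {
    *host_timestamp_ns =
        std::chrono::duration_cast<std::chrono::nanoseconds>(
            std::chrono::steady_clock::now().time_since_epoch())
            .count();
  }
}

int main()
{
  uint64_t ts = 0;
  record_timestamp(false /* use_cuda_event */, nullptr, nullptr, &ts);
  std::printf("host timestamp: %llu ns\n", (unsigned long long)ts);
  return 0;
}
```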
+ LOG_IF_ERROR( + TRITONBACKEND_ModelInstanceReportBatchStatistics( + TritonModelInstance(), total_batch_size, exec_start_ns, + compute_start_ns, compute_end_ns, exec_end_ns), + "failed reporting batch request statistics"); + } +} + +TRITONSERVER_Error* +ModelInstanceState::ReadOutputTensors( + size_t total_batch_size, + const std::vector& output_tensors, + TRITONBACKEND_Request** requests, const uint32_t request_count, + std::vector* responses) +{ + NVTX_RANGE(nvtx_, "ReadOutputTensors " + Name()); + + BackendOutputResponder responder( + requests, request_count, responses, model_state_->TritonMemoryManager(), + model_state_->MaxBatchSize() > 0, model_state_->EnablePinnedInput(), + GetCudaStreamByInstanceKind()); + + bool cuda_copy = false; + // The serialized string buffer must be valid until output copies are done + std::vector> string_buffer; + for (auto& output : model_state_->ModelOutputs()) { + int op_index = output_index_map_[output.first]; + auto name = output.first; + auto output_tensor_pair = output.second; + + if (output_tensors[op_index].isTensor()) { + torch::Tensor output_flat; + try { + output_flat = + output_tensors[op_index].toTensor().contiguous().flatten(); + } + catch (std::exception& ex) { + RETURN_IF_ERROR(TRITONSERVER_ErrorNew( + TRITONSERVER_ERROR_INTERNAL, + (std::string("output tensor '") + name + "' is not found") + .c_str())); + } + + // Verify output datatype matches datatype from model config + TRITONSERVER_DataType output_dtype = + ConvertTorchTypeToDataType(output_flat.scalar_type()); + TRITONSERVER_DataType config_datatype = output_dtype_map_[name]; + if (config_datatype != output_dtype) { + RETURN_IF_ERROR(TRITONSERVER_ErrorNew( + TRITONSERVER_ERROR_INVALID_ARG, + (std::string("configuration expects datatype TYPE_") + + TRITONSERVER_DataTypeString(config_datatype) + " for output '" + + name + "', model provides TYPE_" + + TRITONSERVER_DataTypeString(output_dtype)) + .c_str())); + } + + const char* output_buffer = + static_cast(output_flat.data_ptr()); + + // Output tensors may not reside on the same device as model + torch::Device tensor_device = output_flat.device(); + const auto memory_type = (tensor_device.type() == torch::kCPU) + ? TRITONSERVER_MEMORY_CPU + : TRITONSERVER_MEMORY_GPU; + const auto memory_id = + (tensor_device.type() == torch::kCPU) ? 
0 : tensor_device.index(); + + // Batch output doesn't support string data type yet, as it is not trivial + // to parse string output + const BatchOutput* batch_output = StateForModel()->FindBatchOutput(name); + if (batch_output == nullptr) { + // Get output shape + std::vector batchn_shape; + auto shape = output_tensors[op_index].toTensor().sizes(); + for (auto itr = shape.begin(); itr != shape.end(); itr++) { + batchn_shape.push_back(*itr); + } + + if (batchn_shape.size() == 0) { + return TRITONSERVER_ErrorNew( + TRITONSERVER_ERROR_INVALID_ARG, + (std::string("output '") + name + + "' is a scalar which is not supported.") + .c_str()); + } + if (output_tensor_pair.first != -1) { + responder.ProcessTensor( + name, output_dtype, batchn_shape, output_buffer, memory_type, + memory_id); + } + if (output_tensor_pair.second != -1) { + std::vector states; + states = responder.ProcessStateTensor( + name, output_dtype, batchn_shape, output_buffer, memory_type, + memory_id); + // Update the states + for (auto& state : states) { + RETURN_IF_ERROR(TRITONBACKEND_StateUpdate(state)); + } + } + + } else { + responder.ProcessBatchOutput( + name, *batch_output, output_buffer, memory_type, memory_id); + } + } else if (output_tensors[op_index].isList()) { + // Custom handling for string/bytes tensor... + torch::List output_list = + output_tensors[op_index].toList(); + + // Get output shape + std::vector batchn_shape{(int64_t)output_list.size()}; + + for (size_t idx = 0; idx < responses->size(); idx++) { + auto& request = requests[idx]; + auto& response = (*responses)[idx]; + + if (supports_batching_ != 0) { + TRITONBACKEND_Input* input; + TRITONBACKEND_RequestInputByIndex(request, 0 /* index*/, &input); + const int64_t* shape; + TRITONBACKEND_InputProperties( + input, nullptr, nullptr, &shape, nullptr, nullptr, nullptr); + batchn_shape[0] = shape[0]; + } + + int64_t tensor_element_cnt = 0; + RETURN_IF_ERROR(GetElementCount(batchn_shape, &tensor_element_cnt)); + + // Only need an response tensor for requested outputs. + if (response != nullptr) { + if (output_tensor_pair.first != -1) { + TRITONBACKEND_Output* response_output; + RESPOND_AND_SET_NULL_IF_ERROR( + &response, TRITONBACKEND_ResponseOutput( + response, &response_output, name.c_str(), + TRITONSERVER_TYPE_BYTES, batchn_shape.data(), + batchn_shape.size())); + string_buffer.emplace_back(new std::string()); + cuda_copy |= SetStringOutputBuffer( + &output_list, &response, response_output, tensor_element_cnt, + GetCudaStreamByInstanceKind(), string_buffer.back().get()); + } + } + if (output_tensor_pair.second != -1) { + TRITONBACKEND_State* response_state; + RESPOND_AND_SET_NULL_IF_ERROR( + &response, TRITONBACKEND_StateNew( + &response_state, request, name.c_str(), + TRITONSERVER_TYPE_BYTES, batchn_shape.data(), + batchn_shape.size())); + + string_buffer.emplace_back(new std::string()); + cuda_copy |= SetStringStateBuffer( + &output_list, &response, response_state, tensor_element_cnt, + GetCudaStreamByInstanceKind(), string_buffer.back().get()); + } + } + } else { + return TRITONSERVER_ErrorNew( + TRITONSERVER_ERROR_INVALID_ARG, + (std::string("output '") + name + + "' must be of type Tensor or List[str].") + .c_str()); + } + } + + // Finalize and wait for any pending buffer copies. + cuda_copy |= responder.Finalize(); + +#ifdef TRITON_ENABLE_GPU + // We have to always synchronize the stream. This is to make sure that + // the events on the cuda stream are synchronized. 
Otherwise, the events + // are only guaranteed to be synchronized if the model provides the output + // on GPU. + cudaStreamSynchronize(GetCudaStreamByInstanceKind()); +#endif + + return nullptr; +} + +TRITONSERVER_Error* +ModelInstanceState::RecordBackendTimestamp( + uint64_t* timestamp, void* cuda_event) +{ + if ((Kind() == TRITONSERVER_INSTANCEGROUPKIND_GPU) || + ((Kind() == TRITONSERVER_INSTANCEGROUPKIND_MODEL) && (device_cnt_ > 0))) { +#ifdef TRITON_ENABLE_GPU + cudaEvent_t* lcuda_event = reinterpret_cast(cuda_event); + RETURN_IF_ERROR(ConvertCUDAStatusToTritonError( + cudaEventRecord(*lcuda_event, GetCudaStreamByInstanceKind()), + TRITONSERVER_ERROR_INTERNAL, "Failed to record the event.")); +#endif + } else { + SET_TIMESTAMP(*timestamp); + } + return nullptr; +} + +void +ModelInstanceState::SetCurrentCudaStream( + const cudaStream_t& stream, const int& device_id) +{ +#ifdef TRITON_ENABLE_GPU + at::cuda::CUDAStream torch_stream = + at::cuda::getStreamFromExternal(stream, device_id); + // This function replaces the default stream with the stream we created. It + // is not necessary to change the current device to the desired device when + // replacing the default stream for that device. See the documentation here: + // https://pytorch.org/cppdocs/api/function_namespacec10_1_1cuda_1a6ed50cc0fc16cc7014d9c2f4c3bd098d.html + at::cuda::setCurrentCUDAStream(torch_stream); +#endif +} + +TRITONSERVER_Error* +ModelInstanceState::SetInputTensors( + size_t total_batch_size, TRITONBACKEND_Request** requests, + const uint32_t request_count, + std::vector* responses, + BackendInputCollector* collector, std::vector* input_names, + std::vector* input_tensors, bool* cuda_copy) +{ + // InferenceMode should be used to guard all tensors operations + torch::InferenceMode infer_guard(model_state_->EnabledInferenceMode()); + + // All requests must have equally-sized input tensors so use any + // request as the representative for the input tensors. + uint32_t input_count; + RETURN_IF_ERROR(TRITONBACKEND_RequestInputCount(requests[0], &input_count)); + + input_tensors->resize(input_count + batch_input_count_); + + // The inputs must be in contiguous CPU/GPU memory. + std::vector> alloc_perference; + if (device_.is_cpu()) { + alloc_perference = { + {TRITONSERVER_MEMORY_CPU_PINNED, 0}, {TRITONSERVER_MEMORY_CPU, 0}}; + } else { + alloc_perference = {{TRITONSERVER_MEMORY_GPU, device_.index()}}; + } + + for (uint32_t input_idx = 0; input_idx < input_count; input_idx++) { + TRITONBACKEND_Input* input; + RETURN_IF_ERROR( + TRITONBACKEND_RequestInputByIndex(requests[0], input_idx, &input)); + + const char* input_name; + TRITONSERVER_DataType input_datatype; + const int64_t* input_shape; + uint32_t input_dims_count; + RETURN_IF_ERROR(TRITONBACKEND_InputProperties( + input, &input_name, &input_datatype, &input_shape, &input_dims_count, + nullptr, nullptr)); + + input_names->emplace_back(input_name); + + // The shape for the entire input patch, + // [total_batch_size, ...] 
for non-ragged input and + // [total_element_count] for ragged input (non-nested tensor) + std::vector batchn_shape; + if (StateForModel()->IsInputRagged(input_name)) { + batchn_shape = std::vector{0}; + for (size_t idx = 0; idx < request_count; idx++) { + TRITONBACKEND_Input* input; + RESPOND_AND_SET_NULL_IF_ERROR( + &((*responses)[idx]), + TRITONBACKEND_RequestInput(requests[idx], input_name, &input)); + const int64_t* input_shape; + uint32_t input_dims_count; + RESPOND_AND_SET_NULL_IF_ERROR( + &((*responses)[idx]), TRITONBACKEND_InputProperties( + input, nullptr, nullptr, &input_shape, + &input_dims_count, nullptr, nullptr)); + + int64_t element_cnt = 0; + RESPOND_AND_SET_NULL_IF_ERROR( + &((*responses)[idx]), + GetElementCount(input_shape, input_dims_count, &element_cnt)); + batchn_shape[0] += element_cnt; + } + } else { + batchn_shape = + std::vector(input_shape, input_shape + input_dims_count); + if (supports_batching_) { + batchn_shape[0] = total_batch_size; + } + } + + // The input must be in contiguous CPU/GPU memory. + std::vector> alloc_perference; + // For 'KIND_MODEL', input will always be in CPU as we don't have a way to + // query the input types. + if (device_.is_cpu() || (Kind() == TRITONSERVER_INSTANCEGROUPKIND_MODEL)) { + alloc_perference = { + {TRITONSERVER_MEMORY_CPU_PINNED, 0}, {TRITONSERVER_MEMORY_CPU, 0}}; + } else { + alloc_perference = {{TRITONSERVER_MEMORY_GPU, device_.index()}}; + } + + const char* input_buffer; + size_t batchn_byte_size; + TRITONSERVER_MemoryType memory_type; + int64_t memory_type_id; + RETURN_IF_ERROR(collector->ProcessTensor( + input_name, nullptr, 0, alloc_perference, &input_buffer, + &batchn_byte_size, &memory_type, &memory_type_id)); + + // Create Torch tensor + const auto torch_dtype = ConvertDataTypeToTorchType(input_datatype); + torch::TensorOptions options{torch_dtype.second}; + auto updated_options = (memory_type == TRITONSERVER_MEMORY_GPU) + ? options.device(torch::kCUDA, device_.index()) + : options.device(torch::kCPU); + + if (input_datatype == TRITONSERVER_TYPE_BYTES) { + // Create the PyTorch list to hold the strings. 
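String (`TYPE_STRING` / BYTES) inputs cannot be wrapped with `from_blob`: each element arrives as a 4-byte length followed by the raw bytes, and `SetStringInputTensor` unpacks that into the list created below. Here is a standalone sketch of the unpacking itself, assuming that length-prefixed layout and a little-endian host; `unpack_bytes_tensor` is a hypothetical helper, not a backend function.

```
// Standalone sketch: unpack a Triton BYTES buffer (uint32 length + payload,
// repeated per element) into std::string elements.
#include <cstdint>
#include <cstring>
#include <iostream>
#include <string>
#include <vector>

std::vector<std::string>
unpack_bytes_tensor(const char* buffer, size_t byte_size, size_t element_count)
{
  std::vector<std::string> elements;
  size_t offset = 0;
  while (elements.size() < element_count &&
         offset + sizeof(uint32_t) <= byte_size) {
    uint32_t len = 0;
    std::memcpy(&len, buffer + offset, sizeof(uint32_t));  // little-endian host assumed
    offset += sizeof(uint32_t);
    if (offset + len > byte_size) {
      break;  // malformed element; the backend reports an error in this case
    }
    elements.emplace_back(buffer + offset, len);
    offset += len;
  }
  return elements;
}

int main()
{
  // Two elements, "hi" and "triton", each preceded by its 4-byte length.
  const char raw[] = "\x02\x00\x00\x00hi\x06\x00\x00\x00triton";
  for (const auto& s : unpack_bytes_tensor(raw, sizeof(raw) - 1, 2)) {
    std::cout << s << "\n";
  }
  return 0;
}
```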
+ torch::List input_list; + input_list.reserve(batchn_shape[0]); + + for (size_t idx = 0; idx < request_count; idx++) { + TRITONBACKEND_Input* input; + RESPOND_AND_SET_NULL_IF_ERROR( + &((*responses)[idx]), + TRITONBACKEND_RequestInput(requests[idx], input_name, &input)); + const int64_t* shape; + uint32_t dims_count; + uint32_t buffer_count; + RESPOND_AND_SET_NULL_IF_ERROR( + &((*responses)[idx]), + TRITONBACKEND_InputPropertiesForHostPolicy( + input, HostPolicyName().c_str(), nullptr, nullptr, &shape, + &dims_count, nullptr, &buffer_count)); + + int64_t batch_element_cnt = 0; + RESPOND_AND_SET_NULL_IF_ERROR( + &((*responses)[idx]), + GetElementCount(shape, dims_count, &batch_element_cnt)); + + *cuda_copy |= SetStringInputTensor( + &input_list, input, input_name, buffer_count, batch_element_cnt, + &((*responses)[idx]), GetCudaStreamByInstanceKind(), + HostPolicyName().c_str()); + } + + (*input_tensors)[input_index_map_[input_name]] = input_list; + } else { + if (batchn_byte_size) { + // Remove constness to align with the signature of torch::from_blob() + torch::Tensor input_tensor = torch::from_blob( + const_cast(input_buffer), batchn_shape, updated_options); + (*input_tensors)[input_index_map_[input_name]] = input_tensor; + } else { + // torch:from_blob seems not working when the input size is 0 + // create zero-length inputs directly + torch::Tensor input_tensor = + torch::zeros(batchn_shape, updated_options); + (*input_tensors)[input_index_map_[input_name]] = input_tensor; + } + } + } + + for (const auto& batch_input : StateForModel()->BatchInputs()) { + std::vector shape; + collector->BatchInputShape(batch_input, &shape); + + for (const auto& input_name : batch_input.TargetNames()) { + input_names->emplace_back(input_name.c_str()); + + const char* dst_buffer; + size_t dst_buffer_byte_size; + TRITONSERVER_MemoryType dst_memory_type; + int64_t dst_memory_type_id; + + RESPOND_ALL_AND_SET_NULL_IF_ERROR( + (*responses), responses->size(), + collector->ProcessBatchInput( + batch_input, nullptr, 0, alloc_perference, &dst_buffer, + &dst_buffer_byte_size, &dst_memory_type, &dst_memory_type_id)); + + const auto torch_dtype = + ConvertDataTypeToTorchType(batch_input.DataType()); + torch::TensorOptions options{torch_dtype.second}; + auto updated_options = (dst_memory_type == TRITONSERVER_MEMORY_GPU) + ? options.device(torch::kCUDA, device_.index()) + : options.device(torch::kCPU); + + if (dst_buffer_byte_size) { + torch::Tensor input_tensor = torch::from_blob( + const_cast(dst_buffer), shape, updated_options); + (*input_tensors)[input_index_map_[input_name]] = input_tensor; + } else { + // special handle when input has zero size + torch::Tensor input_tensor = torch::zeros(shape, updated_options); + (*input_tensors)[input_index_map_[input_name]] = input_tensor; + } + } + } + + // Finalize... 
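For non-string inputs the collector returns one contiguous buffer, which the code above wraps zero-copy with `torch::from_blob`, pointing the tensor options at the CPU or at the correct GPU; a zero-byte input falls back to `torch::zeros` instead. A reduced, CPU-only LibTorch sketch of that wrapping (assumes a LibTorch build; not part of the patch):

```
// Standalone LibTorch sketch: wrap an existing buffer without copying, the
// way SetInputTensors does for non-string inputs. CPU-only for simplicity.
#include <torch/torch.h>

#include <cstdint>
#include <iostream>
#include <vector>

int main()
{
  std::vector<float> buffer = {1.f, 2.f, 3.f, 4.f, 5.f, 6.f};
  const std::vector<int64_t> shape = {2, 3};

  auto options =
      torch::TensorOptions().dtype(torch::kFloat32).device(torch::kCPU);

  // from_blob does not own the memory; `buffer` must outlive the tensor.
  // The zero-size branch mirrors the backend's torch::zeros fallback.
  torch::Tensor wrapped = buffer.empty()
                              ? torch::zeros(shape, options)
                              : torch::from_blob(buffer.data(), shape, options);

  std::cout << wrapped << "\n";
  return 0;
}
```

Because the wrap is zero-copy, the backend keeps the collector's buffers alive until inference completes, which is why the input tensors are built and consumed within a single `ProcessRequests` call.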
+ *cuda_copy |= collector->Finalize(); + + return nullptr; +} + +TRITONSERVER_Error* +ModelInstanceState::ValidateBooleanSequenceControl( + triton::common::TritonJson::Value& sequence_batching, + const std::string& control_kind, bool required, bool* have_control) +{ + std::string tensor_name; + std::string tensor_datatype; + RETURN_IF_ERROR(GetBooleanSequenceControlProperties( + sequence_batching, model_state_->Name(), control_kind, required, + &tensor_name, &tensor_datatype, nullptr, nullptr, nullptr, nullptr, + nullptr, nullptr)); + *have_control = !tensor_name.empty(); + if (*have_control) { + std::string deliminator = "__"; + int ip_index = 0; + int start_pos = tensor_name.find(deliminator); + if (start_pos == -1) { + return TRITONSERVER_ErrorNew( + TRITONSERVER_ERROR_INTERNAL, + ("input '" + tensor_name + + "' does not follow __ naming convention.") + .c_str()); + } + + // check if the index part of the name is not an integer + std::string index_str = tensor_name.substr(start_pos + 2); + for (auto itr = index_str.begin(); itr != index_str.end(); itr++) { + if (std::isdigit(*itr) == 0) { + return TRITONSERVER_ErrorNew( + TRITONSERVER_ERROR_INTERNAL, + ("input '" + tensor_name + + "' does not follow __ naming convention.") + .c_str()); + } + } + + ip_index = std::atoi(tensor_name.substr(start_pos + 2).c_str()); + input_index_map_[tensor_name] = ip_index; + } + + return nullptr; // success +} + +TRITONSERVER_Error* +ModelInstanceState::ValidateInputs(const size_t expected_input_cnt) +{ + // Collect all the expected input tensor names and validate that the model + // configuration specifies only those. + std::vector allowed_inputs; + + const torch::jit::Method& method = torch_model_->get_method("forward"); + const auto& schema = method.function().getSchema(); + const std::vector& arguments = schema.arguments(); + + // Currently, only models with a single input of type Dict(str, Tensor) are + // supported. If the model expects more than one input then they must be all + // be of type Tensor. + // + // Ignore the argument at idx 0 if it is of Class type (self param in forward + // function) + size_t start_idx = 0; + if ((arguments.size() > 0) && + (arguments.at(0).type()->kind() == c10::TypeKind::ClassType)) { + start_idx = 1; + } + if ((arguments.size() == (1 + start_idx)) && + (arguments.at(start_idx).type()->kind() == c10::TypeKind::DictType)) { + is_dict_input_ = true; + } else if (arguments.size() > start_idx) { + // Return error if multiple inputs are of kind DictType + for (size_t i = start_idx + 1; i < arguments.size(); i++) { + if (arguments.at(i).type()->kind() == c10::TypeKind::DictType) { + return TRITONSERVER_ErrorNew( + TRITONSERVER_ERROR_INTERNAL, + "Multiple inputs of kind DictType were detected. Only a single " + "input of type Dict(str, Tensor) is supported."); + } + } + + // Return error if all inputs are not of type Tensor + for (size_t i = start_idx; i < arguments.size(); i++) { + if ((arguments.at(i).type()->kind() != c10::TypeKind::TensorType) && + (arguments.at(i).type()->kind() != c10::TypeKind::ListType)) { + return TRITONSERVER_ErrorNew( + TRITONSERVER_ERROR_INTERNAL, + (std::string("An input of type '") + arguments.at(i).type()->str() + + "' was detected in the model. 
Only a single input of type " + "Dict(str, Tensor) or input(s) of type Tensor are supported.") + .c_str()); + } + allowed_inputs.emplace_back(arguments.at(i).name()); + } + + // If all inputs are tensors, match number of expected inputs between model + // and configuration + if ((arguments.size() - start_idx) != expected_input_cnt) { + return TRITONSERVER_ErrorNew( + TRITONSERVER_ERROR_INVALID_ARG, + (std::string("unable to load model '") + model_state_->Name() + + "', configuration expects " + std::to_string(expected_input_cnt) + + " inputs, model provides " + + std::to_string(arguments.size() - start_idx)) + .c_str()); + } + } + + triton::common::TritonJson::Value ios; + RETURN_IF_ERROR(model_state_->ModelConfig().MemberAsArray("input", &ios)); + + if (ios.ArraySize() == 0) { + return TRITONSERVER_ErrorNew( + TRITONSERVER_ERROR_INTERNAL, + "model configuration must contain at least one input, none were " + "specified."); + } + + NamingConvention naming_convention; + RETURN_IF_ERROR(GetNamingConvention(&naming_convention, allowed_inputs)); + + for (size_t i = 0; i < ios.ArraySize(); i++) { + triton::common::TritonJson::Value io; + RETURN_IF_ERROR(ios.IndexAsObject(i, &io)); + + // Validate name + std::string io_name; + RETURN_IF_ERROR(io.MemberAsString("name", &io_name)); + AddInputToMap(naming_convention, allowed_inputs, io_name, i); + // Validate data type + std::string io_dtype; + RETURN_IF_ERROR(io.MemberAsString("data_type", &io_dtype)); + const auto pr = ModelConfigDataTypeToTorchType(io_dtype); + if (!pr.first && (io_dtype != "TYPE_STRING")) { + return TRITONSERVER_ErrorNew( + TRITONSERVER_ERROR_INTERNAL, + ("unsupported datatype " + io_dtype + " for input '" + io_name + + "' for model '" + model_state_->Name() + "'") + .c_str()); + } + + // Validate shape for String inputs. Only allow 1 dimension. + if (io_dtype == "TYPE_STRING") { + // If a reshape is provided for the input then use that when + // validating the model shapes. + std::vector dims; + triton::common::TritonJson::Value reshape; + if (io.Find("reshape", &reshape)) { + RETURN_IF_ERROR(ParseShape(reshape, "shape", &dims)); + } else { + RETURN_IF_ERROR(ParseShape(io, "dims", &dims)); + } + + if ((dims.size() + (supports_batching_ ? 1 : 0)) > 1) { + return TRITONSERVER_ErrorNew( + TRITONSERVER_ERROR_INTERNAL, + ("Triton only supports 1 dimensional List of String as input for " + "'" + + std::string(io_name) + "' for model '" + model_state_->Name() + + "'") + .c_str()); + } + } + } + triton::common::TritonJson::Value sequence_batching; + if (model_state_->ModelConfig().Find( + "sequence_batching", &sequence_batching)) { + triton::common::TritonJson::Value states; + if (sequence_batching.Find("state", &states)) { + for (size_t i = 0; i < states.ArraySize(); i++) { + triton::common::TritonJson::Value state; + RETURN_IF_ERROR(states.IndexAsObject(i, &state)); + std::string state_name; + RETURN_IF_ERROR(state.MemberAsString("input_name", &state_name)); + AddInputToMap(naming_convention, allowed_inputs, state_name, i); + + // Validate data type + std::string state_dtype; + RETURN_IF_ERROR(state.MemberAsString("data_type", &state_dtype)); + const auto pr = ModelConfigDataTypeToTorchType(state_dtype); + if (!pr.first && (state_dtype != "TYPE_STRING")) { + return TRITONSERVER_ErrorNew( + TRITONSERVER_ERROR_INTERNAL, + ("unsupported datatype " + state_dtype + " for input state '" + + state_name + "' for model '" + model_state_->Name() + "'") + .c_str()); + } + + // Validate shape for String inputs. Only allow 1 dimension. 
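`ValidateInputs` derives the set of allowed input names from the TorchScript `forward()` schema, as shown above. The standalone LibTorch sketch below inspects a schema in the same way; `model.pt` is a placeholder path and a LibTorch build is assumed.

```
// Standalone LibTorch sketch: list the forward() arguments of a TorchScript
// module, mirroring how ValidateInputs builds its allowed-inputs list.
#include <torch/script.h>

#include <iostream>

int main()
{
  torch::jit::Module module = torch::jit::load("model.pt");  // placeholder path

  auto method = module.get_method("forward");
  const auto& schema = method.function().getSchema();

  for (const auto& arg : schema.arguments()) {
    // The first argument is typically `self` (the module itself), which the
    // backend skips when it is of class type.
    std::cout << arg.name() << " : " << arg.type()->str() << "\n";
  }
  return 0;
}
```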
+ if (state_dtype == "TYPE_STRING") { + std::vector dims; + if ((dims.size() + (supports_batching_ ? 1 : 0)) > 1) { + return TRITONSERVER_ErrorNew( + TRITONSERVER_ERROR_INTERNAL, + ("Triton only supports 1 dimensional List of String as input " + "for " + "'" + + std::string(state_name) + "' for model '" + + model_state_->Name() + "'") + .c_str()); + } + } + } + } + } + + triton::common::TritonJson::Value batch_inputs; + RETURN_IF_ERROR( + model_state_->ModelConfig().MemberAsArray("batch_input", &batch_inputs)); + size_t i = 0; + for (const auto& batch_input : StateForModel()->BatchInputs()) { + for (const auto& input_name : batch_input.TargetNames()) { + AddInputToMap( + naming_convention, allowed_inputs, input_name, i + ios.ArraySize()); + i++; + } + } + + return nullptr; // success +} + +TRITONSERVER_Error* +ModelInstanceState::ValidateOutputs() +{ + triton::common::TritonJson::Value ios; + RETURN_IF_ERROR(model_state_->ModelConfig().MemberAsArray("output", &ios)); + std::string deliminator = "__"; + int op_index = 0; + + if (ios.ArraySize() == 0) { + return TRITONSERVER_ErrorNew( + TRITONSERVER_ERROR_INTERNAL, + "model configuration must contain at least one output, none were " + "specified."); + } + + NamingConvention naming_convention; + RETURN_IF_ERROR(GetNamingConvention(&naming_convention, {})); + + for (size_t i = 0; i < ios.ArraySize(); i++) { + triton::common::TritonJson::Value io; + RETURN_IF_ERROR(ios.IndexAsObject(i, &io)); + + // Validate name + std::string io_name; + RETURN_IF_ERROR(io.MemberAsString("name", &io_name)); + switch (naming_convention) { + case NamingConvention::NAMED_INDEX: { + int start_pos = io_name.find(deliminator); + op_index = std::atoi(io_name.substr(start_pos + 2).c_str()); + break; + } + case NamingConvention::STRICT_CONFIG_ORDERING: { + op_index = i; + break; + } + default: + break; + } + + // Validate data type + std::string io_dtype; + RETURN_IF_ERROR(io.MemberAsString("data_type", &io_dtype)); + const auto pr = ModelConfigDataTypeToTorchType(io_dtype); + if (!pr.first && (io_dtype != "TYPE_STRING")) { + return TRITONSERVER_ErrorNew( + TRITONSERVER_ERROR_INTERNAL, + ("unsupported datatype " + io_dtype + " for output '" + io_name + + "' for model '" + model_state_->Name() + "'") + .c_str()); + } + + // Validate shape for String outputs. Only allow 1 dimension. + if (io_dtype == "TYPE_STRING") { + // If a reshape is provided for the output then use that when + // validating the model shapes. + std::vector dims; + triton::common::TritonJson::Value reshape; + if (io.Find("reshape", &reshape)) { + RETURN_IF_ERROR(ParseShape(reshape, "shape", &dims)); + } else { + RETURN_IF_ERROR(ParseShape(io, "dims", &dims)); + } + + if ((dims.size() + (supports_batching_ ? 
1 : 0)) > 1) { + return TRITONSERVER_ErrorNew( + TRITONSERVER_ERROR_INTERNAL, + ("Triton only supports 1 dimensional List of String as output for " + "'" + + std::string(io_name) + "' for model '" + model_state_->Name() + + "'") + .c_str()); + } + } + + output_index_map_[io_name] = op_index; + output_dtype_map_[io_name] = ConvertTorchTypeToDataType(pr.second); + } + + triton::common::TritonJson::Value sequence_batching; + if (model_state_->ModelConfig().Find( + "sequence_batching", &sequence_batching)) { + triton::common::TritonJson::Value states; + if (sequence_batching.Find("state", &states)) { + for (size_t i = 0; i < states.ArraySize(); i++) { + triton::common::TritonJson::Value state; + RETURN_IF_ERROR(states.IndexAsObject(i, &state)); + std::string state_name; + RETURN_IF_ERROR(state.MemberAsString("output_name", &state_name)); + std::string state_dtype; + RETURN_IF_ERROR(state.MemberAsString("data_type", &state_dtype)); + std::vector dims; + RETURN_IF_ERROR(ParseShape(state, "dims", &dims)); + + // For state, naming convention is enforced to be NAMED_INDEX + int start_pos = state_name.find(deliminator); + op_index = std::atoi(state_name.substr(start_pos + 2).c_str()); + + const auto pr = ModelConfigDataTypeToTorchType(state_dtype); + if (!pr.first && (state_dtype != "TYPE_STRING")) { + return TRITONSERVER_ErrorNew( + TRITONSERVER_ERROR_INTERNAL, + ("unsupported datatype " + state_dtype + " for state '" + + state_name + "' for model '" + model_state_->Name() + "'") + .c_str()); + } + + // Validate shape for String outputs. Only allow 1 dimension. + if (state_dtype == "TYPE_STRING") { + if ((dims.size() + (supports_batching_ ? 1 : 0)) > 1) { + return TRITONSERVER_ErrorNew( + TRITONSERVER_ERROR_INTERNAL, + ("Triton only supports 1 dimensional List of String as output " + "for " + "'" + + std::string(state_name) + "' for model '" + + model_state_->Name() + "'") + .c_str()); + } + } + + output_index_map_[state_name] = op_index; + output_dtype_map_[state_name] = ConvertTorchTypeToDataType(pr.second); + } + } + } + + return nullptr; // success +} + +TRITONSERVER_Error* +ModelInstanceState::ValidateTypedSequenceControl( + triton::common::TritonJson::Value& sequence_batching, + const std::string& control_kind, bool required, bool* have_control) +{ + std::string tensor_name; + std::string tensor_datatype; + RETURN_IF_ERROR(GetTypedSequenceControlProperties( + sequence_batching, model_state_->Name(), control_kind, required, + &tensor_name, &tensor_datatype)); + *have_control = !tensor_name.empty(); + if (*have_control) { + std::string deliminator = "__"; + int ip_index = 0; + int start_pos = tensor_name.find(deliminator); + if (start_pos == -1) { + return TRITONSERVER_ErrorNew( + TRITONSERVER_ERROR_INTERNAL, + ("input '" + tensor_name + + "' does not follow __ naming convention.") + .c_str()); + } + + // check if the index part of the name is not an integer + std::string index_str = tensor_name.substr(start_pos + 2); + for (auto itr = index_str.begin(); itr != index_str.end(); itr++) { + if (std::isdigit(*itr) == 0) { + return TRITONSERVER_ErrorNew( + TRITONSERVER_ERROR_INTERNAL, + ("input '" + tensor_name + + "' does not follow __ naming convention.") + .c_str()); + } + } + + // check if the data type is supported by PyTorch + if (!ModelConfigDataTypeToTorchType(tensor_datatype).first) { + return TRITONSERVER_ErrorNew( + TRITONSERVER_ERROR_INTERNAL, + ("input '" + tensor_name + "' type '" + tensor_datatype + + "' is not supported by PyTorch.") + .c_str()); + } + + ip_index = 
std::atoi(tensor_name.substr(start_pos + 2).c_str()); + input_index_map_[tensor_name] = ip_index; + } + + return nullptr; // success +} + + +} // namespace triton::backend::pytorch diff --git a/src/model_instance_state.hh b/src/model_instance_state.hh new file mode 100644 index 0000000..b495510 --- /dev/null +++ b/src/model_instance_state.hh @@ -0,0 +1,178 @@ +// Copyright 2019-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions +// are met: +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// * Neither the name of NVIDIA CORPORATION nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +#pragma once + +#include + +#include +#include +#include +#include +#include + +#include "libtorch_utils.h" +#include "model_state.hh" +#include "naming_convention.hh" +#include "triton/backend/backend_common.h" +#include "triton/backend/backend_input_collector.h" +#include "triton/backend/backend_memory.h" +#include "triton/backend/backend_model.h" +#include "triton/backend/backend_model_instance.h" +#include "triton/backend/backend_output_responder.h" +#include "triton/common/nvtx.h" +#include "triton/core/tritonbackend.h" + + +namespace triton::backend::pytorch { + +// +// ModelInstanceState +// +// State associated with a model instance. An object of this class is +// created and associated with each TRITONBACKEND_ModelInstance. +// +class ModelInstanceState : public BackendModelInstance { + private: + ModelState* model_state_; + + // The full path to the TorchScript model file. + std::string model_path_; + + std::shared_ptr torch_model_; + torch::Device device_; + + // Map from configuration name for an input to the index of + // that input in the model. + std::unordered_map input_index_map_; + uint32_t batch_input_count_ = 0; + + // Map from configuration name for an output to the index of + // that output in the model. + std::unordered_map output_index_map_; + std::unordered_map output_dtype_map_; + + // If the input to the tensor is a dictionary of tensors. + bool is_dict_input_; + + // If the model supports batching. 
+ bool supports_batching_; + + cudaEvent_t compute_input_start_event_; + cudaEvent_t compute_infer_start_event_; + cudaEvent_t compute_output_start_event_; + + // Store the cuda streams created for the 'KIND_MODEL' instance group. + std::vector stream_vec_; + + // The number of available devices. + int device_cnt_; + + public: + virtual ~ModelInstanceState(); + + // Clear CUDA cache + void ClearCache(); + + static TRITONSERVER_Error* Create( + ModelState* model_state, + TRITONBACKEND_ModelInstance* triton_model_instance, + ModelInstanceState** state); + + // Execute... + void ProcessRequests( + TRITONBACKEND_Request** requests, const uint32_t request_count); + + // Get the state of the model that corresponds to this instance. + ModelState* StateForModel() const; + + private: + ModelInstanceState( + ModelState* model_state, + TRITONBACKEND_ModelInstance* triton_model_instance); + + void AddInputToMap( + NamingConvention naming_convention, + const std::vector allowed_inputs, const std::string& io_name, + const uint32_t index); + + // Create CUDA events for statistics collection. + void CreateCudaEvents(const int32_t& device_id); + + void Execute( + std::vector* responses, + const uint32_t response_count, + std::vector* input_tensors, + std::vector* output_tensors); + + // Get the elapsed time between two CUDA events. + float GetCudaEventElapsedTime( + const cudaEvent_t& start_event, const cudaEvent_t& end_event); + + // Get the appropriate CUDA stream for input and output handling based on + // the instance group type. + cudaStream_t GetCudaStreamByInstanceKind(); + + // Get the naming convention for inputs/outputs from the model configuration + TRITONSERVER_Error* GetNamingConvention( + NamingConvention* naming_convention, + const std::vector& allowed_io); + + TRITONSERVER_Error* ReadOutputTensors( + size_t total_batch_size, + const std::vector& output_tensors, + TRITONBACKEND_Request** requests, const uint32_t request_count, + std::vector* responses); + + TRITONSERVER_Error* RecordBackendTimestamp( + uint64_t* timestamp, void* cuda_event); + + // Replace the default CUDA stream with the stream we created to ensure + // proper cuda stream synchronization. + void SetCurrentCudaStream( + const cudaStream_t& stream, const int32_t& device_id); + + TRITONSERVER_Error* SetInputTensors( + size_t total_batch_size, TRITONBACKEND_Request** requests, + const uint32_t request_count, + std::vector* responses, + BackendInputCollector* collector, std::vector* input_names, + std::vector* input_tensors, bool* cuda_copy); + + TRITONSERVER_Error* ValidateBooleanSequenceControl( + triton::common::TritonJson::Value& sequence_batching, + const std::string& control_kind, bool required, bool* have_control); + + TRITONSERVER_Error* ValidateInputs(const size_t expected_input_cnt); + + TRITONSERVER_Error* ValidateOutputs(); + + TRITONSERVER_Error* ValidateTypedSequenceControl( + triton::common::TritonJson::Value& sequence_batching, + const std::string& control_kind, bool required, bool* have_control); +}; + +} // namespace triton::backend::pytorch diff --git a/src/model_state.cc b/src/model_state.cc new file mode 100644 index 0000000..b007438 --- /dev/null +++ b/src/model_state.cc @@ -0,0 +1,495 @@ +// Copyright 2019-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions +// are met: +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// * Neither the name of NVIDIA CORPORATION nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +#include "model_state.hh" + +#include + + +namespace { +std::once_flag pytorch_interop_threads_flag; +std::once_flag pytorch_intraop_threads_flag; +} // namespace + +namespace triton::backend::pytorch { + +ModelState::ModelState(TRITONBACKEND_Model* triton_model) + : BackendModel(triton_model), enable_optimized_execution_(true), + enable_inference_mode_(true), enable_cudnn_(true), + enable_cache_cleaning_(false), enable_weight_sharing_(false), + enable_tensor_fuser_pair_({false, true}), + enable_jit_profiling_pair_({false, true}), + enable_jit_executor_pair_({false, true}) +{ +} + +TRITONSERVER_Error* +ModelState::AutoCompleteConfig() +{ + // Auto-complete configuration is not supported since PyTorch does not + // store/capture sufficient model metadata so just log error instead. + LOG_MESSAGE( + TRITONSERVER_LOG_WARN, + (std::string("skipping model configuration auto-complete for '") + + Name() + "': not supported for pytorch backend") + .c_str()); + + return nullptr; // success +} + +TRITONSERVER_Error* +ModelState::Create(TRITONBACKEND_Model* triton_model, ModelState** state) +{ + try { + *state = new ModelState(triton_model); + } + catch (const BackendModelException& ex) { + RETURN_ERROR_IF_TRUE( + ex.err_ == nullptr, TRITONSERVER_ERROR_INTERNAL, + std::string("unexpected nullptr in BackendModelException")); + RETURN_IF_ERROR(ex.err_); + } + + // Auto-complete the configuration if requested... 
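+  // Note that for this backend AutoCompleteConfig() (defined above) only logs
+  // a warning, so when auto-complete is requested the configuration is simply
+  // written back unchanged via SetModelConfig().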
+ bool auto_complete_config = false; + RETURN_IF_ERROR(TRITONBACKEND_ModelAutoCompleteConfig( + triton_model, &auto_complete_config)); + if (auto_complete_config) { + RETURN_IF_ERROR((*state)->AutoCompleteConfig()); + RETURN_IF_ERROR((*state)->SetModelConfig()); + } + + auto& model_outputs = (*state)->model_outputs_; + // Parse the output states in the model configuration + triton::common::TritonJson::Value sequence_batching; + if ((*state)->ModelConfig().Find("sequence_batching", &sequence_batching)) { + triton::common::TritonJson::Value states; + if (sequence_batching.Find("state", &states)) { + for (size_t i = 0; i < states.ArraySize(); i++) { + triton::common::TritonJson::Value state; + RETURN_IF_ERROR(states.IndexAsObject(i, &state)); + std::string output_state_name; + RETURN_IF_ERROR( + state.MemberAsString("output_name", &output_state_name)); + auto it = model_outputs.find(output_state_name); + if (it == model_outputs.end()) { + model_outputs.insert({output_state_name, std::make_pair(-1, i)}); + } else { + it->second.second = i; + } + } + } + } + + // Parse the output names in the model configuration + triton::common::TritonJson::Value outputs; + RETURN_IF_ERROR((*state)->ModelConfig().MemberAsArray("output", &outputs)); + for (size_t i = 0; i < outputs.ArraySize(); i++) { + triton::common::TritonJson::Value output; + THROW_IF_BACKEND_INSTANCE_ERROR(outputs.IndexAsObject(i, &output)); + + // Use names from ModelConfig by reference since the model + // config will persist longer than this inference execution. + std::string output_name; + THROW_IF_BACKEND_INSTANCE_ERROR( + output.MemberAsString("name", &output_name)); + + auto it = model_outputs.find(output_name); + if (it == model_outputs.end()) { + model_outputs.insert({output_name, std::make_pair(i, -1)}); + } else { + it->second.first = i; + } + } + + RETURN_IF_ERROR((*state)->ParseParameters()); + + return nullptr; // success +} + +bool +ModelState::EnabledCacheCleaning() +{ + return enable_cache_cleaning_; +} + +bool +ModelState::EnabledCudnn() +{ + return enable_cudnn_; +} + +bool +ModelState::EnabledInferenceMode() +{ + return enable_inference_mode_; +} + +const std::pair& +ModelState::EnabledJitExecutor() const +{ + return enable_jit_executor_pair_; +} + +const std::pair& +ModelState::EnabledJitProfiling() const +{ + return enable_jit_profiling_pair_; +} + +bool +ModelState::EnabledOptimizedExecution() +{ + return enable_optimized_execution_; +} + +const std::pair& +ModelState::EnabledTensorExprFuser() const +{ + return enable_tensor_fuser_pair_; +} + +bool +ModelState::EnabledWeightSharing() +{ + return enable_weight_sharing_; +} + +TRITONSERVER_Error* +ModelState::LoadModel( + const std::string& artifact_name, const torch::Device device, + std::string* model_path, const TRITONSERVER_InstanceGroupKind& kind, + std::shared_ptr* torch_model) +{ + // Find the TorchScript file that describes the model. If the model + // configuration doesn't have an explicit model file specified then + // use the default name ("model.pt"). 
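+  // For example (illustrative layout): a model named "resnet50" at version 1
+  // in a repository mounted at /models resolves to /models/resnet50/1/model.pt.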
+ std::string cc_model_filename = artifact_name; + if (cc_model_filename.empty()) { + cc_model_filename = "model.pt"; + } + + *model_path = JoinPath( + {RepositoryPath(), std::to_string(Version()), cc_model_filename}); + + { + bool exists; + RETURN_IF_ERROR(FileExists(*model_path, &exists)); + RETURN_ERROR_IF_FALSE( + exists, TRITONSERVER_ERROR_UNAVAILABLE, + std::string("unable to find '") + *model_path + + "' for model instance '" + Name() + "'"); + } + + // If weight sharing is enabled, skip loading model if + // it is already available on the target device + std::pair device_pair; + if (enable_weight_sharing_) { + device_pair = std::make_pair(!device.is_cpu(), device.index()); + auto mit = torch_models_.find(device_pair); + if (mit != torch_models_.end()) { + *torch_model = mit->second; + LOG_MESSAGE( + TRITONSERVER_LOG_INFO, + (std::string("Reusing TorchScript model for instance '") + Name() + + "'") + .c_str()); + return nullptr; // success + } + } + + // Serialize the torch model to string + std::string model_data_str; + RETURN_IF_ERROR(ReadTextFile(*model_path, &model_data_str)); + + // InferenceMode should be used to guard all tensors operations including + // model loading: https://pytorch.org/cppdocs/notes/inference_mode.html + torch::InferenceMode infer_guard(EnabledInferenceMode()); + + try { + std::istringstream model_stream(model_data_str); + if (kind == TRITONSERVER_INSTANCEGROUPKIND_MODEL) { + // Load the model without selecting a device. + torch_model->reset( + new torch::jit::Module(torch::jit::load(model_stream))); + } else { + torch_model->reset( + new torch::jit::Module(torch::jit::load(model_stream, device))); + } + } + catch (const std::exception& ex) { + return TRITONSERVER_ErrorNew( + TRITONSERVER_ERROR_INTERNAL, + ("failed to load model '" + Name() + "': " + ex.what()).c_str()); + } + + if (enable_weight_sharing_) { + if (!((torch_models_.emplace(device_pair, *torch_model)).second)) { + std::string type = device.is_cpu() ? "CPU" : "GPU"; + LOG_MESSAGE( + TRITONSERVER_LOG_WARN, + (std::string("Model already found on target ") + type + " device " + + "(id " + std::to_string(device.index()) + ") for '" + Name() + "'") + .c_str()); + } + } + + return nullptr; // success +} + +const std::map>& +ModelState::ModelOutputs() +{ + return model_outputs_; +} + +TRITONSERVER_Error* +ModelState::ParseParameters() +{ + triton::common::TritonJson::Value params; + bool status = model_config_.Find("parameters", ¶ms); + if (status) { + // If 'DISABLE_OPTIMIZED_EXECUTION' is not present in 'parameters' then no + // update is made to 'enable_optimized_execution_'. + bool disable_optimized_execution = false; + TRITONSERVER_Error* err = ParseParameter( + params, "DISABLE_OPTIMIZED_EXECUTION", &disable_optimized_execution); + if (err != nullptr) { + if (TRITONSERVER_ErrorCode(err) != TRITONSERVER_ERROR_NOT_FOUND) { + return err; + } else { + TRITONSERVER_ErrorDelete(err); + } + } + enable_optimized_execution_ = !disable_optimized_execution; + + LOG_MESSAGE( + TRITONSERVER_LOG_INFO, + (std::string("Optimized execution is ") + + (enable_optimized_execution_ ? "enabled" : "disabled") + + " for model instance '" + Name() + "'") + .c_str()); + + // If 'ENABLE_CACHE_CLEANING' is not present in 'parameters' then + // no update is made to 'enable_cache_cleaning_'. 
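+  // The same pattern is applied to every optional parameter in this method: a
+  // TRITONSERVER_ERROR_NOT_FOUND returned by ParseParameter() is deleted and
+  // the default value is kept, while any other error is returned to the caller.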
+ err = ParseParameter( + params, "ENABLE_CACHE_CLEANING", &enable_cache_cleaning_); + if (err != nullptr) { + if (TRITONSERVER_ErrorCode(err) != TRITONSERVER_ERROR_NOT_FOUND) { + return err; + } else { + TRITONSERVER_ErrorDelete(err); + } + } + + LOG_MESSAGE( + TRITONSERVER_LOG_INFO, + (std::string("Cache Cleaning is ") + + (enable_cache_cleaning_ ? "enabled" : "disabled") + + " for model instance '" + Name() + "'") + .c_str()); + + // If 'INFERENCE_MODE' is not present in 'parameters' then no update is made + // to 'enable_inference_mode_'. + err = ParseParameter(params, "INFERENCE_MODE", &enable_inference_mode_); + if (err != nullptr) { + if (TRITONSERVER_ErrorCode(err) != TRITONSERVER_ERROR_NOT_FOUND) { + return err; + } else { + TRITONSERVER_ErrorDelete(err); + } + } + LOG_MESSAGE( + TRITONSERVER_LOG_INFO, + (std::string("Inference Mode is ") + + (enable_inference_mode_ ? "enabled" : "disabled") + + " for model instance '" + Name() + "'") + .c_str()); + + // If 'DISABLE_CUDNN' is not present in 'parameters' then no update is made + // to 'enable_cudnn_'. + bool disable_cudnn = false; + err = ParseParameter(params, "DISABLE_CUDNN", &disable_cudnn); + if (err != nullptr) { + if (TRITONSERVER_ErrorCode(err) != TRITONSERVER_ERROR_NOT_FOUND) { + return err; + } else { + TRITONSERVER_ErrorDelete(err); + } + } + enable_cudnn_ = !disable_cudnn; + LOG_MESSAGE( + TRITONSERVER_LOG_INFO, + (std::string("cuDNN is ") + (enable_cudnn_ ? "enabled" : "disabled") + + " for model instance '" + Name() + "'") + .c_str()); + + // If 'ENABLE_TENSOR_FUSER' is not present in 'parameters' then no + // update is made to 'enable_tensor_fuser'. + bool enable_tensor_fuser = false; + err = ParseParameter(params, "ENABLE_TENSOR_FUSER", &enable_tensor_fuser); + if (err != nullptr) { + if (TRITONSERVER_ErrorCode(err) != TRITONSERVER_ERROR_NOT_FOUND) { + return err; + } else { + TRITONSERVER_ErrorDelete(err); + } + } else { + enable_tensor_fuser_pair_ = {true, enable_tensor_fuser}; + LOG_MESSAGE( + TRITONSERVER_LOG_INFO, + (std::string("Tensor fuser is ") + + (enable_tensor_fuser ? "enabled" : "disabled") + + " for model instance '" + Name() + "'") + .c_str()); + } + + // If 'ENABLE_WEIGHT_SHARING' is not present in 'parameters' then no + // update is made to 'enable_weight_sharing'. + err = ParseParameter( + params, "ENABLE_WEIGHT_SHARING", &enable_weight_sharing_); + if (err != nullptr) { + if (TRITONSERVER_ErrorCode(err) != TRITONSERVER_ERROR_NOT_FOUND) { + return err; + } else { + TRITONSERVER_ErrorDelete(err); + } + } else { + LOG_MESSAGE( + TRITONSERVER_LOG_INFO, + (std::string("Weight sharing is ") + + (enable_weight_sharing_ ? "enabled" : "disabled") + + " for model instance '" + Name() + "'") + .c_str()); + } + + // If 'ENABLE_JIT_PROFILING' is not present in 'parameters' then no update + // is made to 'enable_jit_profiling'. + bool enable_jit_profiling = false; + err = ParseParameter(params, "ENABLE_JIT_PROFILING", &enable_jit_profiling); + if (err != nullptr) { + if (TRITONSERVER_ErrorCode(err) != TRITONSERVER_ERROR_NOT_FOUND) { + return err; + } else { + TRITONSERVER_ErrorDelete(err); + } + } else { + enable_jit_profiling_pair_ = {true, enable_jit_profiling}; + LOG_MESSAGE( + TRITONSERVER_LOG_INFO, + (std::string("Jit profiling is ") + + (enable_jit_profiling ? "enabled" : "disabled") + + " for model instance '" + Name() + "'") + .c_str()); + } + + // If 'ENABLE_JIT_EXECUTOR' is not present in 'parameters' then no update is + // made to 'enable_jit_executor'. 
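+  // As with the fuser and profiling flags above, the resulting pair records
+  // whether the parameter was explicitly set and, if so, the requested value.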
+ bool enable_jit_executor = false; + err = ParseParameter(params, "ENABLE_JIT_EXECUTOR", &enable_jit_executor); + if (err != nullptr) { + if (TRITONSERVER_ErrorCode(err) != TRITONSERVER_ERROR_NOT_FOUND) { + return err; + } else { + TRITONSERVER_ErrorDelete(err); + } + } else { + enable_jit_executor_pair_ = {true, enable_jit_executor}; + LOG_MESSAGE( + TRITONSERVER_LOG_INFO, + (std::string("Jit executor is ") + + (enable_jit_executor ? "enabled" : "disabled") + + " for model instance '" + Name() + "'") + .c_str()); + } + + // If 'INTRA_OP_THREAD_COUNT' is not present in 'parameters' then no update + // is made to 'intra_op_thread_count', which by default will take all + // threads + int intra_op_thread_count = -1; + err = + ParseParameter(params, "INTRA_OP_THREAD_COUNT", &intra_op_thread_count); + if (err != nullptr) { + if (TRITONSERVER_ErrorCode(err) != TRITONSERVER_ERROR_NOT_FOUND) { + return err; + } else { + TRITONSERVER_ErrorDelete(err); + } + } else { + if (intra_op_thread_count > 0) { + // at::set_num_threads() does not throw if called more than once, but + // issues warnings. std::call_once() is useful to limit these. + std::call_once(pytorch_intraop_threads_flag, [intra_op_thread_count]() { + at::set_num_threads(intra_op_thread_count); + }); + LOG_MESSAGE( + TRITONSERVER_LOG_INFO, + (std::string("Intra op thread count is set to ") + + std::to_string(at::get_num_threads()) + " for model instance '" + + Name() + "'") + .c_str()); + } + } + + // If 'INTER_OP_THREAD_COUNT' is not present in 'parameters' then no update + // is made to 'inter_op_thread_count', which by default will take all + // threads + int inter_op_thread_count = -1; + err = + ParseParameter(params, "INTER_OP_THREAD_COUNT", &inter_op_thread_count); + if (err != nullptr) { + if (TRITONSERVER_ErrorCode(err) != TRITONSERVER_ERROR_NOT_FOUND) { + return err; + } else { + TRITONSERVER_ErrorDelete(err); + } + } else { + if (inter_op_thread_count > 0) { + // at::set_num_interop_threads() throws if called more than once. + // std::call_once() should prevent this, but try/catch is additionally + // used for safety. + std::call_once(pytorch_interop_threads_flag, [inter_op_thread_count]() { + try { + at::set_num_interop_threads(inter_op_thread_count); + } + catch (const c10::Error& e) { + // do nothing + } + }); + LOG_MESSAGE( + TRITONSERVER_LOG_INFO, + (std::string("Inter op thread count is set to ") + + std::to_string(at::get_num_interop_threads()) + + " for model instance '" + Name() + "'") + .c_str()); + } + } + } + + return nullptr; +} + +} // namespace triton::backend::pytorch diff --git a/src/model_state.hh b/src/model_state.hh new file mode 100644 index 0000000..1a404b8 --- /dev/null +++ b/src/model_state.hh @@ -0,0 +1,131 @@ +// Copyright 2019-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions +// are met: +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// * Neither the name of NVIDIA CORPORATION nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. 
+// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +#pragma once + +#include + +#include +#include +#include + +#include "libtorch_utils.h" +#include "naming_convention.hh" +#include "triton/backend/backend_common.h" +#include "triton/backend/backend_input_collector.h" +#include "triton/backend/backend_memory.h" +#include "triton/backend/backend_model.h" +#include "triton/backend/backend_model_instance.h" +#include "triton/backend/backend_output_responder.h" +#include "triton/common/nvtx.h" +#include "triton/core/tritonbackend.h" + +// for thread control +// https://pytorch.org/docs/stable/notes/cpu_threading_torchscript_inference.html#runtime-api +// https://github.com/pytorch/pytorch/blob/v2.2.1-rc3/aten/src/ATen/Parallel.h#L133 +#include + + +namespace triton::backend::pytorch { + +class ModelState : public triton::backend::BackendModel { + private: + // Flag to indicate whether optimized execution is enabled. Defaults to true. + bool enable_optimized_execution_; + + // Flag to indicate whether inference mode is enabled. Defaults to false. + bool enable_inference_mode_; + + // Flag to indicate whether cudnn is enabled. Defaults to true. + bool enable_cudnn_; + + // Flag to indicate whether cache cleaning after each run is enabled. + // Defaults to false. + bool enable_cache_cleaning_; + + // Flag to indicate whether weight sharing is enabled. Defaults to false. + bool enable_weight_sharing_; + + // Flag pairs to indicate if various JIT settings are set and + // enabled respectively. Defaults to (false, true). Default behavior + // is to do nothing if not explicitly set. + std::pair enable_tensor_fuser_pair_; + std::pair enable_jit_profiling_pair_; + std::pair enable_jit_executor_pair_; + + // Model mapping for shared TorchScript model across all instances on the + // same device. The key is a pair of isGPU and device index. + std::map< + std::pair, std::shared_ptr> + torch_models_; + + // model_outputs is a map that contains unique outputs that the model must + // provide. The first pair is the model output index and the second is + // the index in the model state, -1 is used if one is not required. + // In the model configuration, the output in the state configuration + // can have intersection with the outputs section of the model. If an output + // is specified both in the output section and state section, it indicates + // that the backend must return the output state to the client too. 
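+  // For example (illustrative names): an output "OUTPUT__0" that appears at
+  // index 2 of the 'output' section and at index 0 of the state section maps
+  // to {2, 0}; an output that is not also a state maps to {2, -1}.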
+ std::map> model_outputs_; + + public: + virtual ~ModelState() = default; + + static TRITONSERVER_Error* Create( + TRITONBACKEND_Model* triton_model, ModelState** state); + + bool EnabledCacheCleaning(); + + bool EnabledCudnn(); + + bool EnabledInferenceMode(); + + const std::pair& EnabledJitExecutor() const; + + const std::pair& EnabledJitProfiling() const; + + bool EnabledOptimizedExecution(); + + const std::pair& EnabledTensorExprFuser() const; + + bool EnabledWeightSharing(); + + TRITONSERVER_Error* LoadModel( + const std::string& artifact_name, const torch::Device device, + std::string* model_path, const TRITONSERVER_InstanceGroupKind& kind, + std::shared_ptr* torch_model); + + const std::map>& ModelOutputs(); + + private: + ModelState(TRITONBACKEND_Model* triton_model); + + TRITONSERVER_Error* AutoCompleteConfig(); + + TRITONSERVER_Error* ParseParameters(); +}; + +} // namespace triton::backend::pytorch diff --git a/src/naming_convention.hh b/src/naming_convention.hh new file mode 100644 index 0000000..756cba4 --- /dev/null +++ b/src/naming_convention.hh @@ -0,0 +1,40 @@ +// Copyright 2019-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions +// are met: +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// * Neither the name of NVIDIA CORPORATION nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +#pragma once + + +namespace triton::backend::pytorch { + +// The naming convention followed for inputs/outputs in the model configuration. +// Outputs don't support FORWARD_ARGUMENT. +enum class NamingConvention { + NAMED_INDEX, + FORWARD_ARGUMENT, + STRICT_CONFIG_ORDERING +}; + +} // namespace triton::backend::pytorch diff --git a/src/string_utils.cc b/src/string_utils.cc new file mode 100644 index 0000000..a605c7c --- /dev/null +++ b/src/string_utils.cc @@ -0,0 +1,254 @@ +// Copyright 2019-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions +// are met: +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. 
+// * Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// * Neither the name of NVIDIA CORPORATION nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +#include "string_utils.hh" + + +namespace triton::backend::pytorch { + +// This function will return a tensor's contents as a contiguous +// chunk in system memory. In some cases this will require copying the data. +// If that happens, 'contiguous_buffer' will be set to hold the contiguous +// chunk and 'cuda_copy' will be set to indicate whether CUDA copy is +// conducted. The data copy can be avoided if the input is already in +// a contiguous chunk and the input is located in memory type and id +// specified. +TRITONSERVER_Error* +GetContiguousInputContent( + TRITONBACKEND_Input* rinput, const uint32_t buffer_count, + const char** content, size_t* content_byte_size, + std::vector* contiguous_buffer, cudaStream_t stream, bool* cuda_copy) +{ + *cuda_copy = false; + + // Check input buffers to see if data copy is necessary + size_t chunk_count = 0; + bool type_mismatch = false; + uint64_t total_byte_size = 0; + for (size_t idx = 0; idx < buffer_count; ++idx) { + TRITONSERVER_MemoryType src_memory_type; + int64_t src_memory_type_id; + size_t src_byte_size; + const void* src_ptr; + + RETURN_IF_ERROR(TRITONBACKEND_InputBuffer( + rinput, idx, &src_ptr, &src_byte_size, &src_memory_type, + &src_memory_type_id)); + + if (src_ptr != nullptr) { + chunk_count++; + total_byte_size += src_byte_size; + type_mismatch |= (src_memory_type == TRITONSERVER_MEMORY_GPU); + } + } + + if (chunk_count == 0) { + *content = nullptr; + *content_byte_size = 0; + } else if ((chunk_count == 1) && !type_mismatch) { + TRITONSERVER_MemoryType src_memory_type; + int64_t src_memory_type_id; + RETURN_IF_ERROR(TRITONBACKEND_InputBuffer( + rinput, 0, (const void**)content, content_byte_size, &src_memory_type, + &src_memory_type_id)); + } else { + contiguous_buffer->resize(total_byte_size); + + size_t offset = 0; + for (size_t i = 0; i < chunk_count; i++) { + bool cuda_used; + TRITONSERVER_MemoryType src_memory_type; + int64_t src_memory_type_id; + size_t src_byte_size; + const void* src_ptr; + + RETURN_IF_ERROR(TRITONBACKEND_InputBuffer( + rinput, i, &src_ptr, &src_byte_size, &src_memory_type, + &src_memory_type_id)); + RETURN_IF_ERROR(CopyBuffer( + "Contiguous input", src_memory_type, src_memory_type_id, + TRITONSERVER_MEMORY_CPU, 0, src_byte_size, src_ptr, + contiguous_buffer->data() + offset, stream, &cuda_used)); + *cuda_copy |= 
cuda_used; + offset += src_byte_size; + } + + *content = contiguous_buffer->data(); + *content_byte_size = total_byte_size; + } + + return nullptr; // success +} + +void +FillStringTensor(torch::List* input_list, const size_t cnt) +{ + for (size_t c = 0; c < cnt; ++c) { + input_list->push_back(""); + } +} + +bool +SetStringBuffer( + torch::List* tensor, TRITONBACKEND_Response** response, + TRITONBACKEND_Output* response_output, TRITONBACKEND_State* response_state, + const size_t tensor_element_count, cudaStream_t stream, + std::string* serialized, bool state) +{ + bool cuda_copy = false; + + // Serialize the output tensor strings. Each string is serialized as + // a 4-byte length followed by the string itself with no + // null-terminator. + serialized->clear(); + for (size_t e = 0; e < tensor_element_count; ++e) { + std::string str = tensor->get(e).to(); + const char* cstr = str.c_str(); + size_t len = str.length(); + serialized->append(reinterpret_cast(&len), sizeof(uint32_t)); + if (len > 0) { + serialized->append(cstr, len); + } + } + + // Allocate a buffer large enough to hold the serialized tensor. + TRITONSERVER_MemoryType actual_memory_type = TRITONSERVER_MEMORY_CPU; + int64_t actual_memory_type_id = 0; + + TRITONSERVER_Error* err; + void* buffer; + + if (!state) { + auto err = TRITONBACKEND_OutputBuffer( + response_output, &buffer, serialized->size(), &actual_memory_type, + &actual_memory_type_id); + if (err != nullptr) { + RESPOND_AND_SET_NULL_IF_ERROR(response, err); + return cuda_copy; + } + } else { + auto err = TRITONBACKEND_StateBuffer( + response_state, &buffer, serialized->size(), &actual_memory_type, + &actual_memory_type_id); + if (err != nullptr) { + RESPOND_AND_SET_NULL_IF_ERROR(response, err); + return cuda_copy; + } + } + // Copy the serialized tensor into the allocated buffer. + bool cuda_used = false; + err = CopyBuffer( + "String output", TRITONSERVER_MEMORY_CPU /* src_memory_type */, + 0 /* src_memory_type_id */, actual_memory_type, actual_memory_type_id, + serialized->size(), reinterpret_cast(serialized->c_str()), + buffer, stream, &cuda_used); + cuda_copy |= cuda_used; + + if (err != nullptr) { + RESPOND_AND_SET_NULL_IF_ERROR(response, err); + return cuda_copy; + } + + if (state) { + RESPOND_AND_SET_NULL_IF_ERROR( + response, TRITONBACKEND_StateUpdate(response_state)); + } + + return cuda_copy; +} + +bool +SetStringInputTensor( + torch::List* input_list, TRITONBACKEND_Input* input, + const char* name, const uint32_t buffer_count, + const size_t request_element_cnt, TRITONBACKEND_Response** response, + cudaStream_t stream, const char* host_policy_name) +{ + bool cuda_copy = false; + + // For string data type, we always need to have the data on CPU so + // that we can read string length and construct the string + // properly. So if the request's input tensor is not in CPU need to + // copy it there. + const char* content = nullptr; + size_t content_byte_size = 0; + + std::vector contiguous_buffer; + auto err = GetContiguousInputContent( + input, buffer_count, &content, &content_byte_size, &contiguous_buffer, + stream, &cuda_copy); + if (err != nullptr) { + RESPOND_AND_SET_NULL_IF_ERROR(response, err); + FillStringTensor(input_list, request_element_cnt); + return cuda_copy; + } + +#ifdef TRITON_ENABLE_GPU + if (cuda_copy) { + cudaStreamSynchronize(stream); + cuda_copy = false; + } +#endif // TRITON_ENABLE_GPU + + std::vector> str_list; + err = ValidateStringBuffer( + content, content_byte_size, request_element_cnt, name, &str_list); + // Set string values. 
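+  // Each entry of 'str_list' is a (pointer, length) view into 'content', which
+  // holds elements in the serialized form described above. For example
+  // (illustrative), the element "abc" occupies 7 bytes: a 4-byte length in
+  // native byte order (03 00 00 00 on little-endian hosts) followed by the raw
+  // characters with no null terminator.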
+ for (const auto& [addr, len] : str_list) { + input_list->push_back(std::string(addr, len)); + } + + size_t element_cnt = str_list.size(); + if (err != nullptr) { + RESPOND_AND_SET_NULL_IF_ERROR(response, err); + FillStringTensor(input_list, request_element_cnt - element_cnt); + } + return cuda_copy; +} + +bool +SetStringOutputBuffer( + torch::List* tensor, TRITONBACKEND_Response** response, + TRITONBACKEND_Output* response_output, const size_t tensor_element_count, + cudaStream_t stream, std::string* serialized) +{ + return SetStringBuffer( + tensor, response, response_output, nullptr /* response_state */, + tensor_element_count, stream, serialized, false /* state */); +} + +bool +SetStringStateBuffer( + torch::List* tensor, TRITONBACKEND_Response** response, + TRITONBACKEND_State* response_state, const size_t tensor_element_count, + cudaStream_t stream, std::string* serialized) +{ + return SetStringBuffer( + tensor, response, nullptr /* response_output */, response_state, + tensor_element_count, stream, serialized, true /* state */); +} + +} // namespace triton::backend::pytorch diff --git a/src/string_utils.hh b/src/string_utils.hh new file mode 100644 index 0000000..8373478 --- /dev/null +++ b/src/string_utils.hh @@ -0,0 +1,106 @@ +// Copyright 2019-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions +// are met: +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// * Neither the name of NVIDIA CORPORATION nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +#pragma once + +#include + +#include +#include +#include + +#include "libtorch_utils.h" +#include "triton/backend/backend_common.h" +#include "triton/backend/backend_input_collector.h" +#include "triton/backend/backend_memory.h" +#include "triton/backend/backend_model.h" +#include "triton/backend/backend_model_instance.h" +#include "triton/backend/backend_output_responder.h" +#include "triton/common/nvtx.h" +#include "triton/core/tritonbackend.h" + +#ifdef TRITON_PYTORCH_ENABLE_TORCHVISION +// Suppress warnings in torch headers +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wsign-compare" +#pragma warning(push, 0) +#include +#include // Torchvision header +#pragma warning(pop) +#pragma GCC diagnostic pop +#endif // TRITON_PYTORCH_ENABLE_TORCHVISION + +#ifdef TRITON_ENABLE_GPU +#include +#include +#include +#endif // TRITON_ENABLE_GPU + +// for thread control +// https://pytorch.org/docs/stable/notes/cpu_threading_torchscript_inference.html#runtime-api +// https://github.com/pytorch/pytorch/blob/v2.2.1-rc3/aten/src/ATen/Parallel.h#L133 +#include + + +namespace triton::backend::pytorch { + +void FillStringTensor(torch::List* input_list, const size_t cnt); + +// This function will return a tensor's contents as a contiguous +// chunk in system memory. In some cases this will require copying the data. +// If that happens, 'contiguous_buffer' will be set to hold the contiguous +// chunk and 'cuda_copy' will be set to indicate whether CUDA copy is +// conducted. The data copy can be avoided if the input is already in +// a contiguous chunk and the input is located in memory type and id +// specified. +TRITONSERVER_Error* GetContiguousInputContent( + TRITONBACKEND_Input* rinput, const uint32_t buffer_count, + const char** content, size_t* content_byte_size, + std::vector* contiguous_buffer, cudaStream_t stream, bool* cuda_copy); + +bool SetStringBuffer( + torch::List* tensor, TRITONBACKEND_Response** response, + TRITONBACKEND_Output* response_output, TRITONBACKEND_State* response_state, + const size_t tensor_element_count, cudaStream_t stream, + std::string* serialized, bool state); + +bool SetStringInputTensor( + torch::List* input_list, TRITONBACKEND_Input* input, + const char* name, const uint32_t buffer_count, + const size_t request_element_cnt, TRITONBACKEND_Response** response, + cudaStream_t stream, const char* host_policy_name); + +bool SetStringOutputBuffer( + torch::List* tensor, TRITONBACKEND_Response** response, + TRITONBACKEND_Output* response_output, const size_t tensor_element_count, + cudaStream_t stream, std::string* serialized); + +bool SetStringStateBuffer( + torch::List* tensor, TRITONBACKEND_Response** response, + TRITONBACKEND_State* response_state, const size_t tensor_element_count, + cudaStream_t stream, std::string* serialized); + +} // namespace triton::backend::pytorch From 7e9d0f9d29f5c87f36c5c5fd22d8bdd5e242c1b2 Mon Sep 17 00:00:00 2001 From: Misha Chornyi <99709299+mc-nv@users.noreply.github.com> Date: Wed, 5 Nov 2025 10:57:33 -0800 Subject: [PATCH 76/76] build(fix): Update header file reference (#170) * Update header file reference * fix: address pre-commit issue * Update header name --- src/libtorch.hh | 2 +- src/model_instance_state.cc | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/src/libtorch.hh b/src/libtorch.hh index 263c340..4bd4700 100644 --- a/src/libtorch.hh +++ b/src/libtorch.hh @@ -27,7 +27,7 @@ #include "model_instance_state.hh" #include "model_state.hh" #include "naming_convention.hh" -#include 
"string_utilities.hh" +#include "string_utils.hh" // // PyTorch C++ (LibTorch) Backend that implements the TRITONBACKEND API. diff --git a/src/model_instance_state.cc b/src/model_instance_state.cc index 7cd5ee3..19cae27 100644 --- a/src/model_instance_state.cc +++ b/src/model_instance_state.cc @@ -25,7 +25,8 @@ // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "model_instance_state.hh" -#include "string_utilities.hh" + +#include "string_utils.hh" #ifdef TRITON_PYTORCH_ENABLE_TORCHVISION // Suppress warnings in torch headers