From a4f3138f1b2734e3fd82def3c539e7a911b81650 Mon Sep 17 00:00:00 2001 From: Hemant Jain Date: Tue, 26 Apr 2022 08:30:40 -0700 Subject: [PATCH 01/76] Fix libc10 missing issue on jetson (#56) (#58) --- CMakeLists.txt | 124 ++++++++++++++++++++++++++++++------------------- 1 file changed, 77 insertions(+), 47 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 076b095..909148f 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -125,60 +125,67 @@ endif() # TRITON_ENABLE_GPU # configure_file(src/libtriton_pytorch.ldscript libtriton_pytorch.ldscript COPYONLY) -if (${TRITON_PYTORCH_DOCKER_BUILD}) - if (CMAKE_HOST_SYSTEM_PROCESSOR MATCHES "aarch64") - set(LIBS_ARCH "aarch64") - set(CONDA_LIBS - "libopenblas.so.0" - ) - else() - set(LIBS_ARCH "x86_64") - set(CONDA_LIBS - "libmkl_core.so" - "libmkl_gnu_thread.so" - "libmkl_intel_lp64.so" - "libmkl_intel_thread.so" - "libmkl_def.so" - "libmkl_vml_def.so" - "libmkl_rt.so" - "libmkl_avx2.so" - "libmkl_avx512.so" - "libmkl_sequential.so" - "libomp.so" - ) - endif() +set(PT_LIBS + "libc10.so" + "libc10_cuda.so" + "libtorch.so" + "libtorch_cpu.so" + "libtorch_cuda.so" + "libtorch_global_deps.so" +) + +if (${TRITON_PYTORCH_ENABLE_TORCHVISION}) set(PT_LIBS - ${CONDA_LIBS} - "libc10.so" - "libc10_cuda.so" - "libtorch.so" - "libtorch_cpu.so" - "libtorch_cuda.so" - "libtorch_global_deps.so" + ${PT_LIBS} "libtorchvision.so" ) - set(OPENCV_LIBS - "libopencv_video.so" - "libopencv_videoio.so" - "libopencv_highgui.so" - "libopencv_imgcodecs.so" - "libopencv_imgproc.so" - "libopencv_core.so" - "libpng16.so" +endif() # TRITON_PYTORCH_ENABLE_TORCHVISION + +if (${TRITON_PYTORCH_ENABLE_TORCHTRT}) + set(PT_LIBS + ${PT_LIBS} + "libtorchtrt_runtime.so" + ) +endif() # TRITON_PYTORCH_ENABLE_TORCHTRT + +if (CMAKE_HOST_SYSTEM_PROCESSOR MATCHES "aarch64") + set(LIBS_ARCH "aarch64") + set(CONDA_LIBS + "libopenblas.so.0" + ) +else() + set(LIBS_ARCH "x86_64") + set(CONDA_LIBS + "libmkl_core.so" + "libmkl_gnu_thread.so" + "libmkl_intel_lp64.so" + "libmkl_intel_thread.so" + "libmkl_def.so" + "libmkl_vml_def.so" + "libmkl_rt.so" + "libmkl_avx2.so" + "libmkl_avx512.so" + "libmkl_sequential.so" + "libomp.so" ) +endif() +set(OPENCV_LIBS + "libopencv_video.so" + "libopencv_videoio.so" + "libopencv_highgui.so" + "libopencv_imgcodecs.so" + "libopencv_imgproc.so" + "libopencv_core.so" + "libpng16.so" +) +if (${TRITON_PYTORCH_DOCKER_BUILD}) string(REPLACE ";" " " CONDA_LIBS_STR "${CONDA_LIBS}") - if (${TRITON_PYTORCH_ENABLE_TORCHTRT}) - set(PT_LIBS - ${PT_LIBS} - "libtorchtrt_runtime.so" - ) - endif() # TRITON_PYTORCH_ENABLE_TORCHTRT - add_custom_command( OUTPUT ${PT_LIBS} + ${CONDA_LIBS} ${OPENCV_LIBS} LICENSE.pytorch include/torch @@ -218,7 +225,7 @@ if (${TRITON_PYTORCH_DOCKER_BUILD}) COMMENT "Extracting pytorch and torchvision libraries and includes from ${TRITON_PYTORCH_DOCKER_IMAGE}" VERBATIM ) - add_custom_target(ptlib_target DEPENDS ${PT_LIBS} ${OPENCV_LIBS}) + add_custom_target(ptlib_target DEPENDS ${PT_LIBS} ${CONDA_LIBS} ${OPENCV_LIBS}) add_library(ptlib SHARED IMPORTED GLOBAL) add_dependencies(ptlib ptlib_target) @@ -384,7 +391,7 @@ install( if (${TRITON_PYTORCH_DOCKER_BUILD}) set(PT_LIB_PATHS "") - FOREACH(plib ${PT_LIBS} ${OPENCV_LIBS}) + FOREACH(plib ${PT_LIBS} ${CONDA_LIBS} ${OPENCV_LIBS}) set(PT_LIB_PATHS ${PT_LIB_PATHS} "${CMAKE_CURRENT_BINARY_DIR}/${plib}") ENDFOREACH(plib) @@ -403,7 +410,7 @@ if (${TRITON_PYTORCH_DOCKER_BUILD}) ) endif() # TRITON_PYTORCH_ENABLE_TORCHTRT - FOREACH(plib ${PT_LIBS} ${OPENCV_LIBS}) + FOREACH(plib ${PT_LIBS} ${CONDA_LIBS} 
${OPENCV_LIBS}) install( CODE "EXECUTE_PROCESS( @@ -433,6 +440,29 @@ if (${TRITON_PYTORCH_DOCKER_BUILD}) message(FATAL_ERROR \"FAILED: to create links\") endif()" ) +else() + FOREACH(plib ${PT_LIBS}) + set(PT_LIB_PATHS ${PT_LIB_PATHS} "${TRITON_PYTORCH_LIB_PATHS}/${plib}") + ENDFOREACH(plib) + + install( + FILES + ${PT_LIB_PATHS} + DESTINATION ${CMAKE_INSTALL_PREFIX}/backends/pytorch + ) + + FOREACH(plib ${PT_LIBS}) + install( + CODE + "EXECUTE_PROCESS( + COMMAND patchelf --set-rpath \$ORIGIN ${plib} + RESULT_VARIABLE PATCHELF_STATUS + WORKING_DIRECTORY ${CMAKE_INSTALL_PREFIX}/backends/pytorch) + if(PATCHELF_STATUS AND NOT PATCHELF_STATUS EQUAL 0) + message(FATAL_ERROR \"FAILED: to run patchelf\") + endif()" + ) + ENDFOREACH(plib) endif() # TRITON_PYTORCH_DOCKER_BUILD install( From 510cc49df8012bee78839e976ff79a0909992cdc Mon Sep 17 00:00:00 2001 From: Hemant Jain Date: Wed, 4 May 2022 13:54:15 -0700 Subject: [PATCH 02/76] Fix typos + cleanup ReadMe (#62) --- README.md | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/README.md b/README.md index 0eb8388..96282ce 100644 --- a/README.md +++ b/README.md @@ -67,7 +67,7 @@ $ make install ``` The following required Triton repositories will be pulled and used in -the build. By default the "main" branch/tag will be used for each repo +the build. By default, the "main" branch/tag will be used for each repo but the listed CMake argument can be used to override. * triton-inference-server/backend: -DTRITON_BACKEND_REPO_TAG=[tag] @@ -100,10 +100,10 @@ $ make install ### Parameters Triton exposes some flags to control the execution mode of the TorchScript models through -the Parameters section of the model's 'config.pbtxt' file. +the Parameters section of the model's `config.pbtxt` file. * `DISABLE_OPTIMIZED_EXECUTION`: Boolean flag to disable the optimized execution -of TorchScript models. By default the optimized execuiton is always enabled. +of TorchScript models. By default, the optimized execution is always enabled. The initial calls to a loaded TorchScript model take extremely long. Due to this longer model warmup [issue](https://github.com/pytorch/pytorch/issues/57894), Triton also allows @@ -117,13 +117,13 @@ The section of model config file specifying this parameter will look like: parameters: { key: "DISABLE_OPTIMIZED_EXECUTION" value: { - string_value:"true" + string_value: "true" } } ``` * `INFERENCE_MODE`: Boolean flag to enable the Inference Mode execution -of TorchScript models. By default the inference mode is disabled. +of TorchScript models. By default, the inference mode is disabled. [InferenceMode](https://pytorch.org/cppdocs/notes/inference_mode.html) is a new RAII guard analogous to NoGradMode to be used when you are certain your operations @@ -139,14 +139,14 @@ The section of model config file specifying this parameter will look like: parameters: { key: "INFERENCE_MODE" value: { - string_value:"true" + string_value: "true" } } ``` * `ENABLE_NVFUSER`: Boolean flag to enable the NvFuser (CUDA Graph Fuser) optimization for TorchScript models. If not specified, the -default pytorch fuser is used. If `ENABLE_NVFUSER` is specified, the +default PyTorch fuser is used. If `ENABLE_NVFUSER` is specified, the `ENABLE_TENSOR_FUSER` configuration (see below) is ignored. 
Please note that in some models generated using trace in old PyTorch versions might not work @@ -159,7 +159,7 @@ The section of model config file specifying this parameter will look like: parameters: { key: "ENABLE_NVFUSER" value: { - string_value:"true" + string_value: "true" } } ``` @@ -174,7 +174,7 @@ The section of model config file specifying this parameter will look like: parameters: { key: "ENABLE_WEIGHT_SHARING" value: { - string_value:"true" + string_value: "true" } } ``` @@ -191,9 +191,9 @@ complex execution modes and dynamic shapes. If not specified, all are enabled by ### Important Note -* The execution of pytorch model on GPU is asynchronous in nature. See +* The execution of PyTorch model on GPU is asynchronous in nature. See [here](https://pytorch.org/docs/stable/notes/cuda.html#asynchronous-execution) - for more details. Consequently, an error in pytorch model execution may + for more details. Consequently, an error in PyTorch model execution may be raised during the next few inference requests to the server. Setting environment variable `CUDA_LAUNCH_BLOCKING=1` when launching server will help in correctly debugging failing cases by forcing synchronous execution. @@ -201,8 +201,8 @@ complex execution modes and dynamic shapes. If not specified, all are enabled by state and a restart of the server may be required to continue serving successfully. -* Multiple instances of the pytorch model on GPU do not always - increase performance. Due to thread specific caching in pytorch, using +* Multiple instances of the PyTorch model on GPU do not always + increase performance. Due to thread specific caching in PyTorch, using multiple instances of the model interact negatively. See [here](https://github.com/pytorch/pytorch/issues/27902) for more details. Setting the parameter `DISABLE_OPTIMIZED_EXECUTION` to "true" in the model From ab59a37ebecc3d921b9d7bc5a00fecf4defcc529 Mon Sep 17 00:00:00 2001 From: Hemant Jain Date: Mon, 9 May 2022 09:56:26 -0700 Subject: [PATCH 03/76] Add support for String input/output (#60) * x * Fix string input tensor support * Fix string list creation * Add support for String output * cleanup * review edits * Readme fixes * review edits - throw error during model loading * follow up edits * Fix typo in check * Use verbose checking of output type to ensure appropriate error message is returned during model execution * cleanup * Wrap raw pointer to allow auto freeing of memory * follow up review edits --- README.md | 8 + src/libtorch.cc | 531 +++++++++++++++++++++++++++++++++++++++++------- 2 files changed, 471 insertions(+), 68 deletions(-) diff --git a/README.md b/README.md index 96282ce..4b41c1a 100644 --- a/README.md +++ b/README.md @@ -208,3 +208,11 @@ complex execution modes and dynamic shapes. If not specified, all are enabled by Setting the parameter `DISABLE_OPTIMIZED_EXECUTION` to "true" in the model configuration may help in some cases to avoid these negative interactions due to model specific caching and increase multiple instance performance. + +* PyTorch does not support Tensor of Strings but it does support models that accept +a List of Strings as input(s) / produces a List of String as output(s). For these models +Triton allows users to pass String input(s)/recieve String output(s) using the String +datatype. As a limitation of using List instead of Tensor for String I/O, only for +1-dimensional input(s)/output(s) are supported for I/O of String type. +Batching is not allowed for PyTorch models with String I/O. 
For these models, +the user must specify `max_batch_size: 0` in the configuration. diff --git a/src/libtorch.cc b/src/libtorch.cc index 6934a6c..e22b55a 100644 --- a/src/libtorch.cc +++ b/src/libtorch.cc @@ -461,7 +461,7 @@ class ModelInstanceState : public BackendModelInstance { std::vector* responses, const uint32_t response_count, std::vector* input_tensors, - std::vector* output_tensors); + std::vector* output_tensors); TRITONSERVER_Error* SetInputTensors( size_t total_batch_size, TRITONBACKEND_Request** requests, const uint32_t request_count, @@ -471,7 +471,7 @@ class ModelInstanceState : public BackendModelInstance { std::vector* input_memories, bool* cuda_copy); TRITONSERVER_Error* ReadOutputTensors( size_t total_batch_size, const std::vector& output_names, - const std::vector& output_tensors, + const std::vector& output_tensors, TRITONBACKEND_Request** requests, const uint32_t request_count, std::vector* responses, uint64_t* compute_end_ns); @@ -689,7 +689,8 @@ ModelInstanceState::ValidateInputs(const size_t expected_input_cnt) // Return error if all inputs are not of type Tensor for (size_t i = start_idx; i < arguments.size(); i++) { - if (arguments.at(i).type()->kind() != c10::TypeKind::TensorType) { + if ((arguments.at(i).type()->kind() != c10::TypeKind::TensorType) && + (arguments.at(i).type()->kind() != c10::TypeKind::ListType)) { return TRITONSERVER_ErrorNew( TRITONSERVER_ERROR_INTERNAL, (std::string("An input of type '") + arguments.at(i).type()->str() + @@ -725,6 +726,8 @@ ModelInstanceState::ValidateInputs(const size_t expected_input_cnt) "specified."); } + bool supports_batching = model_state_->MaxBatchSize() > 0; + for (size_t i = 0; i < ios.ArraySize(); i++) { triton::common::TritonJson::Value io; RETURN_IF_ERROR(ios.IndexAsObject(i, &io)); @@ -766,13 +769,37 @@ ModelInstanceState::ValidateInputs(const size_t expected_input_cnt) std::string io_dtype; RETURN_IF_ERROR(io.MemberAsString("data_type", &io_dtype)); const auto pr = ModelConfigDataTypeToTorchType(io_dtype); - if (!pr.first) { + if (!pr.first && (io_dtype != "TYPE_STRING")) { return TRITONSERVER_ErrorNew( TRITONSERVER_ERROR_INTERNAL, ("unsupported datatype " + io_dtype + " for input '" + io_name + "' for model '" + model_state_->Name() + "'") .c_str()); } + + // Validate shape for String inputs. Only allow 1 dimension and no + // batching. + if (io_dtype == "TYPE_STRING") { + // If a reshape is provided for the input then use that when + // validating the model shapes. 
+ std::vector dims; + triton::common::TritonJson::Value reshape; + if (io.Find("reshape", &reshape)) { + RETURN_IF_ERROR(ParseShape(reshape, "shape", &dims)); + } else { + RETURN_IF_ERROR(ParseShape(io, "dims", &dims)); + } + + if ((dims.size() > 1) || supports_batching) { + return TRITONSERVER_ErrorNew( + TRITONSERVER_ERROR_INTERNAL, + ("Triton only supports 1 dimensional List of String as input for " + "'" + + std::string(io_name) + "' for model '" + model_state_->Name() + + "'") + .c_str()); + } + } } return nullptr; // success @@ -793,6 +820,8 @@ ModelInstanceState::ValidateOutputs() "specified."); } + const bool supports_batching = model_state_->MaxBatchSize() > 0; + for (size_t i = 0; i < ios.ArraySize(); i++) { triton::common::TritonJson::Value io; RETURN_IF_ERROR(ios.IndexAsObject(i, &io)); @@ -819,13 +848,38 @@ ModelInstanceState::ValidateOutputs() std::string io_dtype; RETURN_IF_ERROR(io.MemberAsString("data_type", &io_dtype)); const auto pr = ModelConfigDataTypeToTorchType(io_dtype); - if (!pr.first) { + if (!pr.first && (io_dtype != "TYPE_STRING")) { return TRITONSERVER_ErrorNew( TRITONSERVER_ERROR_INTERNAL, ("unsupported datatype " + io_dtype + " for output '" + io_name + "' for model '" + model_state_->Name() + "'") .c_str()); } + + // Validate shape for String outputs. Only allow 1 dimension and no + // batching. + if (io_dtype == "TYPE_STRING") { + // If a reshape is provided for the output then use that when + // validating the model shapes. + std::vector dims; + triton::common::TritonJson::Value reshape; + if (io.Find("reshape", &reshape)) { + RETURN_IF_ERROR(ParseShape(reshape, "shape", &dims)); + } else { + RETURN_IF_ERROR(ParseShape(io, "dims", &dims)); + } + + if ((dims.size() > 1) || supports_batching) { + return TRITONSERVER_ErrorNew( + TRITONSERVER_ERROR_INTERNAL, + ("Triton only supports 1 dimensional List of String as output for " + "'" + + std::string(io_name) + "' for model '" + model_state_->Name() + + "'") + .c_str()); + } + } + output_index_map_[io_name] = op_index; output_dtype_map_[io_name] = ConvertTorchTypeToDataType(pr.second); } @@ -965,7 +1019,7 @@ ModelInstanceState::ProcessRequests( // 'output_tensors' are parallel vectors and so must be kept in // sync. std::vector output_names; - std::vector output_tensors; + std::vector output_tensors; if (!all_response_failed) { triton::common::TritonJson::Value ios; TRITONSERVER_Error* err = @@ -1036,8 +1090,7 @@ ModelInstanceState::ProcessRequests( std::string( "The output " + std::string(name) + " in the model configuration refers to an output index " - "which" - " doesn't exist. This model has " + + "which doesn't exist. 
This model has " + std::to_string(max_index + 1) + " outputs") .c_str())); invalid_index = true; @@ -1104,7 +1157,7 @@ ModelInstanceState::Execute( std::vector* responses, const uint32_t response_count, std::vector* input_tensors, - std::vector* output_tensors) + std::vector* output_tensors) { torch::jit::IValue model_outputs_; @@ -1169,20 +1222,38 @@ ModelInstanceState::Execute( if (model_outputs_.isTuple()) { auto model_outputs_tuple = model_outputs_.toTuple(); + size_t op_index = 0; for (auto& m_op : model_outputs_tuple->elements()) { - output_tensors->push_back(m_op.toTensor()); - } - } else { - try { - auto model_output_tensor = model_outputs_.toTensor(); - output_tensors->push_back(model_output_tensor); + if (m_op.isList()) { + auto list_output = m_op.toList(); + if (list_output.elementType()->kind() != c10::TypeKind::StringType) { + throw std::invalid_argument( + "output at index " + std::to_string(op_index) + + " must be of type Tensor or List[str], recieved List[" + + list_output.elementType()->str() + "]"); + } + output_tensors->push_back(m_op); + } else { + auto tensor_output = m_op.toTensor(); + output_tensors->push_back(m_op); + } + op_index++; } - catch (std::exception& exx) { + } else if (model_outputs_.isTensor()) { + output_tensors->push_back(model_outputs_); + } else if (model_outputs_.isList()) { + auto list_output = model_outputs_.toList(); + if (list_output.elementType()->kind() != c10::TypeKind::StringType) { throw std::invalid_argument( - "Output of torch model should be tensor or a tuple of tensors, not " - "a list / dictionary of tensors or a scalar: " + - std::string(exx.what())); + "output must be of type Tensor or List[str], recieved List[" + + list_output.elementType()->str() + "]"); } + output_tensors->push_back(model_outputs_); + } else { + throw std::invalid_argument( + "output must be of type Tensor, List[str] or Tuple " + "containing one of these two types. It should not be a List / " + "Dictionary of Tensors or a Scalar"); } } catch (std::exception& ex) { @@ -1194,6 +1265,244 @@ ModelInstanceState::Execute( } } +// This function will return a tensor's contents as a contiguous +// chunk in system memory. In some cases this will require copying the data. +// If that happens, 'contiguous_buffer' will be set to hold the contiguous +// chunk and 'cuda_copy' will be set to indicate whether CUDA copy is +// conducted. The data copy can be avoided if the input is already in +// a contiguous chunk and the input is located in memory type and id +// specified. 
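+// When a copy is made, 'content' ends up pointing into 'contiguous_buffer', so the
+// caller must keep 'contiguous_buffer' alive for as long as 'content' is read.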
+TRITONSERVER_Error* +GetContiguousInputContent( + TRITONBACKEND_Input* rinput, const uint32_t buffer_count, + const char** content, size_t* content_byte_size, + std::vector* contiguous_buffer, cudaStream_t stream, bool* cuda_copy) +{ + *cuda_copy = false; + + // Check input buffers to see if data copy is necessary + size_t chunk_count = 0; + bool type_mismatch = false; + uint64_t total_byte_size = 0; + for (size_t idx = 0; idx < buffer_count; ++idx) { + TRITONSERVER_MemoryType src_memory_type; + int64_t src_memory_type_id; + size_t src_byte_size; + const void* src_ptr; + + RETURN_IF_ERROR(TRITONBACKEND_InputBuffer( + rinput, idx, &src_ptr, &src_byte_size, &src_memory_type, + &src_memory_type_id)); + + if (src_ptr != nullptr) { + chunk_count++; + total_byte_size += src_byte_size; + type_mismatch |= (src_memory_type == TRITONSERVER_MEMORY_GPU); + } + } + + if (chunk_count == 0) { + *content = nullptr; + *content_byte_size = 0; + } else if ((chunk_count == 1) && !type_mismatch) { + TRITONSERVER_MemoryType src_memory_type; + int64_t src_memory_type_id; + RETURN_IF_ERROR(TRITONBACKEND_InputBuffer( + rinput, 0, (const void**)content, content_byte_size, &src_memory_type, + &src_memory_type_id)); + } else { + contiguous_buffer->resize(total_byte_size); + + size_t offset = 0; + for (size_t i = 0; i < chunk_count; i++) { + bool cuda_used; + TRITONSERVER_MemoryType src_memory_type; + int64_t src_memory_type_id; + size_t src_byte_size; + const void* src_ptr; + + RETURN_IF_ERROR(TRITONBACKEND_InputBuffer( + rinput, i, &src_ptr, &src_byte_size, &src_memory_type, + &src_memory_type_id)); + RETURN_IF_ERROR(CopyBuffer( + "Contiguous input", src_memory_type, src_memory_type_id, + TRITONSERVER_MEMORY_CPU, 0, src_byte_size, src_ptr, + contiguous_buffer->data() + offset, stream, &cuda_used)); + *cuda_copy |= cuda_used; + offset += src_byte_size; + } + + *content = contiguous_buffer->data(); + *content_byte_size = total_byte_size; + } + + return nullptr; // success +} + +void +FillStringTensor( + torch::List* input_list, const size_t idx, const size_t cnt) +{ + for (size_t c = 0; c < cnt; ++c) { + input_list->push_back(""); + } +} + +bool +SetStringInputTensor( + torch::List* input_list, TRITONBACKEND_Input* input, + const char* name, const uint32_t buffer_count, + const size_t request_element_cnt, const size_t tensor_offset, + TRITONBACKEND_Response** response, cudaStream_t stream, + const char* host_policy_name) +{ + bool cuda_copy = false; + size_t element_idx = 0; + + // For string data type, we always need to have the data on CPU so + // that we can read string length and construct the string + // properly. So if the request's input tensor is not in CPU need to + // copy it there. + const char* content = nullptr; + size_t content_byte_size = 0; + + std::vector contiguous_buffer; + auto err = GetContiguousInputContent( + input, buffer_count, &content, &content_byte_size, &contiguous_buffer, + stream, &cuda_copy); + if (err != nullptr) { + RESPOND_AND_SET_NULL_IF_ERROR(response, err); + FillStringTensor( + input_list, tensor_offset + element_idx, + request_element_cnt - element_idx); + return cuda_copy; + } + +#ifdef TRITON_ENABLE_GPU + if (cuda_copy) { + cudaStreamSynchronize(stream); + cuda_copy = false; + } +#endif // TRITON_ENABLE_GPU + + // Parse content and assign to 'tensor'. Each string in 'content' + // is a 4-byte length followed by the string itself with no + // null-terminator. 
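+  // For example, on a little-endian machine the two strings "ab" and "c" arrive
+  // as the 11 bytes: 02 00 00 00 61 62 01 00 00 00 63 (0x61='a', 0x62='b', 0x63='c').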
+ while (content_byte_size >= sizeof(uint32_t)) { + if (element_idx >= request_element_cnt) { + RESPOND_AND_SET_NULL_IF_ERROR( + response, + TRITONSERVER_ErrorNew( + TRITONSERVER_ERROR_INVALID_ARG, + std::string( + "unexpected number of string elements " + + std::to_string(element_idx + 1) + " for inference input '" + + name + "', expecting " + std::to_string(request_element_cnt)) + .c_str())); + FillStringTensor( + input_list, tensor_offset + element_idx, + request_element_cnt - element_idx); + return cuda_copy; + } + + const uint32_t len = *(reinterpret_cast(content)); + content += sizeof(uint32_t); + content_byte_size -= sizeof(uint32_t); + + if (content_byte_size < len) { + RESPOND_AND_SET_NULL_IF_ERROR( + response, + TRITONSERVER_ErrorNew( + TRITONSERVER_ERROR_INVALID_ARG, + std::string( + "incomplete string data for inference input '" + + std::string(name) + "', expecting string of length " + + std::to_string(len) + " but only " + + std::to_string(content_byte_size) + " bytes available") + .c_str())); + FillStringTensor( + input_list, tensor_offset + element_idx, + request_element_cnt - element_idx); + return cuda_copy; + } + + // Set string value + input_list->push_back(std::string(content, len)); + + content += len; + content_byte_size -= len; + element_idx++; + } + + if ((*response != nullptr) && (element_idx != request_element_cnt)) { + RESPOND_AND_SET_NULL_IF_ERROR( + response, TRITONSERVER_ErrorNew( + TRITONSERVER_ERROR_INTERNAL, + std::string( + "expected " + std::to_string(request_element_cnt) + + " strings for inference input '" + name + "', got " + + std::to_string(element_idx)) + .c_str())); + FillStringTensor( + input_list, tensor_offset + element_idx, + request_element_cnt - element_idx); + } + + return cuda_copy; +} + +bool +SetStringOutputBuffer( + torch::List* tensor, TRITONBACKEND_Response** response, + TRITONBACKEND_Output* response_output, const size_t tensor_element_count, + const size_t tensor_offset, cudaStream_t stream, std::string* serialized) +{ + bool cuda_copy = false; + + // Serialize the output tensor strings. Each string is serialized as + // a 4-byte length followed by the string itself with no + // null-terminator. + serialized->clear(); + for (size_t e = 0; e < tensor_element_count; ++e) { + std::string str = tensor->get(e).to(); + const char* cstr = str.c_str(); + size_t len = str.length(); + serialized->append(reinterpret_cast(&len), sizeof(uint32_t)); + if (len > 0) { + serialized->append(cstr, len); + } + } + + // Allocate a buffer large enough to hold the serialized tensor. + TRITONSERVER_MemoryType actual_memory_type = TRITONSERVER_MEMORY_CPU; + int64_t actual_memory_type_id = 0; + + void* buffer; + auto err = TRITONBACKEND_OutputBuffer( + response_output, &buffer, serialized->size(), &actual_memory_type, + &actual_memory_type_id); + if (err != nullptr) { + RESPOND_AND_SET_NULL_IF_ERROR(response, err); + return cuda_copy; + } + + // Copy the serialized tensor into the allocated buffer. 
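+  // CopyBuffer sets 'cuda_used' when the copy was issued asynchronously on 'stream',
+  // in which case the caller must synchronize the stream before reading 'buffer'.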
+ bool cuda_used = false; + err = CopyBuffer( + "String output", TRITONSERVER_MEMORY_CPU /* src_memory_type */, + 0 /* src_memory_type_id */, actual_memory_type, actual_memory_type_id, + serialized->size(), reinterpret_cast(serialized->c_str()), + buffer, stream, &cuda_used); + cuda_copy |= cuda_used; + + if (err != nullptr) { + RESPOND_AND_SET_NULL_IF_ERROR(response, err); + return cuda_copy; + } + + return cuda_copy; +} + TRITONSERVER_Error* ModelInstanceState::SetInputTensors( size_t total_batch_size, TRITONBACKEND_Request** requests, @@ -1252,17 +1561,59 @@ ModelInstanceState::SetInputTensors( input_name, nullptr, 0, alloc_perference, &input_buffer, &batchn_byte_size, &memory_type, &memory_type_id)); - // Create Torch tenor + // Create Torch tensor const auto torch_dtype = ConvertDataTypeToTorchType(input_datatype); torch::TensorOptions options{torch_dtype.second}; auto updated_options = (memory_type == TRITONSERVER_MEMORY_GPU) ? options.device(torch::kCUDA, device_.index()) : options.device(torch::kCPU); - // Remove constness to align with the signature of torch::from_blob() - torch::Tensor input_tensor = torch::from_blob( - const_cast(input_buffer), batchn_shape, updated_options); - (*input_tensors)[input_index_map_[input_name]] = input_tensor; + + if (input_datatype == TRITONSERVER_TYPE_BYTES) { + if (batchn_shape.size() != 1) { + return TRITONSERVER_ErrorNew( + TRITONSERVER_ERROR_INTERNAL, ("Triton only supports 1 dimensional " + "List of string as input for '" + + std::string(input_name) + "'") + .c_str()); + } + + // Create the PyTorch list to hold the strings. + torch::List input_list; + input_list.reserve(batchn_shape[0]); + + size_t tensor_offset = 0; + + for (size_t idx = 0; idx < request_count; idx++) { + TRITONBACKEND_Input* input; + RESPOND_AND_SET_NULL_IF_ERROR( + &((*responses)[idx]), + TRITONBACKEND_RequestInput(requests[idx], input_name, &input)); + const int64_t* shape; + uint32_t dims_count; + uint32_t buffer_count; + RESPOND_AND_SET_NULL_IF_ERROR( + &((*responses)[idx]), + TRITONBACKEND_InputPropertiesForHostPolicy( + input, HostPolicyName().c_str(), nullptr, nullptr, &shape, + &dims_count, nullptr, &buffer_count)); + + const int64_t batch_element_cnt = GetElementCount(shape, dims_count); + + *cuda_copy |= SetStringInputTensor( + &input_list, input, input_name, buffer_count, batch_element_cnt, + tensor_offset, &((*responses)[idx]), CudaStream(), + HostPolicyName().c_str()); + tensor_offset += batch_element_cnt; + } + + (*input_tensors)[input_index_map_[input_name]] = input_list; + } else { + // Remove constness to align with the signature of torch::from_blob() + torch::Tensor input_tensor = torch::from_blob( + const_cast(input_buffer), batchn_shape, updated_options); + (*input_tensors)[input_index_map_[input_name]] = input_tensor; + } } // Finalize... 
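+  // Note: a string input is handed to the scripted model as a single List[str]
+  // IValue, so the corresponding forward() argument must be typed List[str].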
@@ -1274,7 +1625,7 @@ ModelInstanceState::SetInputTensors( TRITONSERVER_Error* ModelInstanceState::ReadOutputTensors( size_t total_batch_size, const std::vector& output_names, - const std::vector& output_tensors, + const std::vector& output_tensors, TRITONBACKEND_Request** requests, const uint32_t request_count, std::vector* responses, uint64_t* compute_end_ns) { @@ -1284,62 +1635,106 @@ ModelInstanceState::ReadOutputTensors( CudaStream()); bool cuda_copy = false; - std::vector> string_buffers; + // The serialized string buffer must be valid until output copies are done + std::vector> string_buffer; for (size_t idx = 0; idx < output_names.size(); idx++) { std::string name = output_names[idx]; int op_index = output_index_map_[name]; - torch::Tensor output_flat; - try { - output_flat = output_tensors[op_index].contiguous().flatten(); - } - catch (std::exception& ex) { - RETURN_IF_ERROR(TRITONSERVER_ErrorNew( - TRITONSERVER_ERROR_INTERNAL, - (std::string("output tensor '") + name + "' is not found").c_str())); - } + if (output_tensors[op_index].isTensor()) { + torch::Tensor output_flat; + try { + output_flat = + output_tensors[op_index].toTensor().contiguous().flatten(); + } + catch (std::exception& ex) { + RETURN_IF_ERROR(TRITONSERVER_ErrorNew( + TRITONSERVER_ERROR_INTERNAL, + (std::string("output tensor '") + name + "' is not found") + .c_str())); + } - // Verify output datatype matches datatype from model config - TRITONSERVER_DataType output_dtype = - ConvertTorchTypeToDataType(output_flat.scalar_type()); - TRITONSERVER_DataType config_datatype = output_dtype_map_[name]; - if (config_datatype != output_dtype) { - RETURN_IF_ERROR(TRITONSERVER_ErrorNew( - TRITONSERVER_ERROR_INVALID_ARG, - (std::string("configuration expects datatype TYPE_") + - TRITONSERVER_DataTypeString(config_datatype) + " for output '" + - name + "', model provides TYPE_" + - TRITONSERVER_DataTypeString(output_dtype)) - .c_str())); - } + // Verify output datatype matches datatype from model config + TRITONSERVER_DataType output_dtype = + ConvertTorchTypeToDataType(output_flat.scalar_type()); + TRITONSERVER_DataType config_datatype = output_dtype_map_[name]; + if (config_datatype != output_dtype) { + RETURN_IF_ERROR(TRITONSERVER_ErrorNew( + TRITONSERVER_ERROR_INVALID_ARG, + (std::string("configuration expects datatype TYPE_") + + TRITONSERVER_DataTypeString(config_datatype) + " for output '" + + name + "', model provides TYPE_" + + TRITONSERVER_DataTypeString(output_dtype)) + .c_str())); + } - const char* output_buffer = - static_cast(output_flat.data_ptr()); + const char* output_buffer = + static_cast(output_flat.data_ptr()); - // Output tensors may not reside on the same device as model - torch::Device tensor_device = output_flat.device(); + // Output tensors may not reside on the same device as model + torch::Device tensor_device = output_flat.device(); - // Set output shape - std::vector batchn_shape; - auto shape = output_tensors[op_index].sizes(); - for (auto itr = shape.begin(); itr != shape.end(); itr++) { - batchn_shape.push_back(*itr); - } + // Get output shape + std::vector batchn_shape; + auto shape = output_tensors[op_index].toTensor().sizes(); + for (auto itr = shape.begin(); itr != shape.end(); itr++) { + batchn_shape.push_back(*itr); + } - if (batchn_shape.size() == 0) { - RETURN_IF_ERROR(TRITONSERVER_ErrorNew( + if (batchn_shape.size() == 0) { + return TRITONSERVER_ErrorNew( + TRITONSERVER_ERROR_INVALID_ARG, + (std::string("output '") + name + + "' is a scalar which is not supported.") + .c_str()); + } + 
+ responder.ProcessTensor( + name, output_dtype, batchn_shape, output_buffer, + (tensor_device.type() == torch::kCPU) ? TRITONSERVER_MEMORY_CPU + : TRITONSERVER_MEMORY_GPU, + (tensor_device.type() == torch::kCPU) ? 0 : tensor_device.index()); + + } else if (output_tensors[op_index].isList()) { + // Custom handling for string/bytes tensor... + + torch::List output_list = + output_tensors[op_index].toList(); + + // Get output shape + std::vector batchn_shape{(int64_t)output_list.size()}; + + size_t tensor_offset = 0; + + for (size_t idx = 0; idx < responses->size(); idx++) { + auto& response = (*responses)[idx]; + + const size_t tensor_element_cnt = GetElementCount(batchn_shape); + + // Only need an response tensor for requested outputs. + if (response != nullptr) { + TRITONBACKEND_Output* response_output; + RESPOND_AND_SET_NULL_IF_ERROR( + &response, TRITONBACKEND_ResponseOutput( + response, &response_output, name.c_str(), + TRITONSERVER_TYPE_BYTES, batchn_shape.data(), + batchn_shape.size())); + string_buffer.emplace_back(new std::string()); + cuda_copy |= SetStringOutputBuffer( + &output_list, &response, response_output, tensor_element_cnt, + tensor_offset, CudaStream(), string_buffer.back().get()); + } + + tensor_offset += tensor_element_cnt; + } + } else { + return TRITONSERVER_ErrorNew( TRITONSERVER_ERROR_INVALID_ARG, (std::string("output '") + name + - "' is a scalar which is not supported.") - .c_str())); + "' must be of type Tensor or List[str].") + .c_str()); } - responder.ProcessTensor( - name, output_dtype, batchn_shape, output_buffer, - (tensor_device.type() == torch::kCPU) ? TRITONSERVER_MEMORY_CPU - : TRITONSERVER_MEMORY_GPU, - (tensor_device.type() == torch::kCPU) ? 0 : tensor_device.index()); - // PyTorch uses asynchronous execution to run the model. Setting the compute // end timestamp immediately after Execute() does not capture the complete // model execution time. When the first output buffer is accessed/copied by From ff103c40e7263076a33e84c16e30e6501143422c Mon Sep 17 00:00:00 2001 From: Hemant Jain Date: Thu, 19 May 2022 17:55:14 -0700 Subject: [PATCH 04/76] Enforce ordering of I/O if naming convention is not followed (#63) * Enforce ordering of I/O if naming convention is not followed * Enforce usage of consistent naming convention for inputs and outputs - Convention between inputs and outputs can differ * Use helper function GetNamingConvention - use switch case - use c++ style enum * Use class enum - cleanup unnecessary code blocks * Add clarifying comment about atoi usage * fix typo * Remove try catch for atoi and use checks for is digit instead --- src/libtorch.cc | 233 +++++++++++++++++++++++++++++++++++++----------- 1 file changed, 181 insertions(+), 52 deletions(-) diff --git a/src/libtorch.cc b/src/libtorch.cc index e22b55a..e39a2ec 100644 --- a/src/libtorch.cc +++ b/src/libtorch.cc @@ -424,6 +424,14 @@ ModelState::ParseParameters() return nullptr; } +// The naming convention followed for inputs/outputs in the model configuration. +// Outputs don't support FORWARD_ARGUMENT. 
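+// NAMED_INDEX: the I/O name carries its index as a suffix (the <name>__<index> form).
+// FORWARD_ARGUMENT: input names match the argument names of the model's forward() method.
+// STRICT_CONFIG_ORDERING: fall back to the order in which I/O are listed in the configuration.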
+enum class NamingConvention { + NAMED_INDEX, + FORWARD_ARGUMENT, + STRICT_CONFIG_ORDERING +}; + // // ModelInstanceState // @@ -476,6 +484,11 @@ class ModelInstanceState : public BackendModelInstance { std::vector* responses, uint64_t* compute_end_ns); + // Get the naming convention for inputs/outputs from the model configuration + TRITONSERVER_Error* GetNamingConvention( + NamingConvention* naming_convention, + const std::set& allowed_io); + ModelState* model_state_; // The full path to the TorchScript model file. @@ -597,21 +610,29 @@ ModelInstanceState::ValidateBooleanSequenceControl( if (*have_control) { std::string deliminator = "__"; int ip_index = 0; - try { - int start_pos = tensor_name.find(deliminator); - if (start_pos == -1) { - throw std::invalid_argument("input must follow naming convention"); - } - ip_index = std::atoi(tensor_name.substr(start_pos + 2).c_str()); - input_index_map_[tensor_name] = ip_index; - } - catch (std::exception& ex) { + int start_pos = tensor_name.find(deliminator); + if (start_pos == -1) { return TRITONSERVER_ErrorNew( TRITONSERVER_ERROR_INTERNAL, ("input '" + tensor_name + - "' does not follow naming convention i.e. __.") + "' does not follow __ naming convention.") .c_str()); } + + // check if the index part of the name is not an integer + std::string index_str = tensor_name.substr(start_pos + 2); + for (auto itr = index_str.begin(); itr != index_str.end(); itr++) { + if (std::isdigit(*itr) == 0) { + return TRITONSERVER_ErrorNew( + TRITONSERVER_ERROR_INTERNAL, + ("input '" + tensor_name + + "' does not follow __ naming convention.") + .c_str()); + } + } + + ip_index = std::atoi(tensor_name.substr(start_pos + 2).c_str()); + input_index_map_[tensor_name] = ip_index; } return nullptr; // success @@ -631,21 +652,29 @@ ModelInstanceState::ValidateTypedSequenceControl( if (*have_control) { std::string deliminator = "__"; int ip_index = 0; - try { - int start_pos = tensor_name.find(deliminator); - if (start_pos == -1) { - throw std::invalid_argument("input must follow naming convention"); - } - ip_index = std::atoi(tensor_name.substr(start_pos + 2).c_str()); - input_index_map_[tensor_name] = ip_index; - } - catch (std::exception& ex) { + int start_pos = tensor_name.find(deliminator); + if (start_pos == -1) { return TRITONSERVER_ErrorNew( TRITONSERVER_ERROR_INTERNAL, ("input '" + tensor_name + - "' does not follow naming convention i.e. 
__.") + "' does not follow __ naming convention.") .c_str()); } + + // check if the index part of the name is not an integer + std::string index_str = tensor_name.substr(start_pos + 2); + for (auto itr = index_str.begin(); itr != index_str.end(); itr++) { + if (std::isdigit(*itr) == 0) { + return TRITONSERVER_ErrorNew( + TRITONSERVER_ERROR_INTERNAL, + ("input '" + tensor_name + + "' does not follow __ naming convention.") + .c_str()); + } + } + + ip_index = std::atoi(tensor_name.substr(start_pos + 2).c_str()); + input_index_map_[tensor_name] = ip_index; } return nullptr; // success @@ -727,6 +756,8 @@ ModelInstanceState::ValidateInputs(const size_t expected_input_cnt) } bool supports_batching = model_state_->MaxBatchSize() > 0; + NamingConvention naming_convention; + RETURN_IF_ERROR(GetNamingConvention(&naming_convention, allowed_inputs)); for (size_t i = 0; i < ios.ArraySize(); i++) { triton::common::TritonJson::Value io; @@ -740,27 +771,24 @@ ModelInstanceState::ValidateInputs(const size_t expected_input_cnt) // input names since they are the keys for the dictionary input_index_map_[io_name] = i; } else { - // input tensor name must be in 'allowed_inputs' or must follow the naming - // convention - auto itr = allowed_inputs.find(io_name); - if (itr != allowed_inputs.end()) { - input_index_map_[io_name] = std::distance(allowed_inputs.begin(), itr); - } else { - try { - int start_pos = io_name.find(deliminator); - if (start_pos == -1) { - throw std::invalid_argument("input must follow naming convention"); + switch (naming_convention) { + case NamingConvention::FORWARD_ARGUMENT: { + auto itr = allowed_inputs.find(io_name); + if (itr != allowed_inputs.end()) { + input_index_map_[io_name] = + std::distance(allowed_inputs.begin(), itr); } + break; + } + case NamingConvention::NAMED_INDEX: { + int start_pos = io_name.find(deliminator); ip_index = std::atoi(io_name.substr(start_pos + 2).c_str()); input_index_map_[io_name] = ip_index; + break; } - catch (std::exception& ex) { - return TRITONSERVER_ErrorNew( - TRITONSERVER_ERROR_INTERNAL, - ("input '" + io_name + - "' is neither an input argument to the model nor does it " - "follow the naming convention i.e. __.") - .c_str()); + case NamingConvention::STRICT_CONFIG_ORDERING: { + input_index_map_[io_name] = i; + break; } } } @@ -821,6 +849,8 @@ ModelInstanceState::ValidateOutputs() } const bool supports_batching = model_state_->MaxBatchSize() > 0; + NamingConvention naming_convention; + RETURN_IF_ERROR(GetNamingConvention(&naming_convention, {})); for (size_t i = 0; i < ios.ArraySize(); i++) { triton::common::TritonJson::Value io; @@ -829,19 +859,18 @@ ModelInstanceState::ValidateOutputs() // Validate name std::string io_name; RETURN_IF_ERROR(io.MemberAsString("name", &io_name)); - try { - int start_pos = io_name.find(deliminator); - if (start_pos == -1) { - throw std::invalid_argument("output must follow naming convention"); + switch (naming_convention) { + case NamingConvention::NAMED_INDEX: { + int start_pos = io_name.find(deliminator); + op_index = std::atoi(io_name.substr(start_pos + 2).c_str()); + break; } - op_index = std::atoi(io_name.substr(start_pos + 2).c_str()); - } - catch (std::exception& ex) { - return TRITONSERVER_ErrorNew( - TRITONSERVER_ERROR_INTERNAL, - ("output '" + io_name + - "' does not follow naming convention i.e. 
__.") - .c_str()); + case NamingConvention::STRICT_CONFIG_ORDERING: { + op_index = i; + break; + } + default: + break; } // Validate data type @@ -1251,9 +1280,9 @@ ModelInstanceState::Execute( output_tensors->push_back(model_outputs_); } else { throw std::invalid_argument( - "output must be of type Tensor, List[str] or Tuple " - "containing one of these two types. It should not be a List / " - "Dictionary of Tensors or a Scalar"); + "output must be of type Tensor, List[str] or Tuple containing one of " + "these two types. It should not be a List / Dictionary of Tensors or " + "a Scalar"); } } catch (std::exception& ex) { @@ -1265,6 +1294,106 @@ ModelInstanceState::Execute( } } +TRITONSERVER_Error* +ModelInstanceState::GetNamingConvention( + NamingConvention* naming_convention, + const std::set& allowed_ios) +{ + // Rules for (non-Dictionary) input tensor names: + // 1. Must be in 'allowed_inputs' (arguments in the forward function) + // 2. Must follow the naming convention i.e. __ + // 3. If neither of the above conditions are satisfied, enforce strict + // ordering of model inputs. + // + // Rules for output tensor names: + // 1. Must follow the naming convention i.e. __ + // 2. If not, we enforce strict ordering of model outputs. + std::string deliminator = "__"; + std::string io_kind = "input"; + *naming_convention = NamingConvention::FORWARD_ARGUMENT; + + // symbolizes output + if (allowed_ios.size() == 0) { + io_kind = "output"; + *naming_convention = NamingConvention::NAMED_INDEX; + } + + triton::common::TritonJson::Value ios; + RETURN_IF_ERROR( + model_state_->ModelConfig().MemberAsArray(io_kind.c_str(), &ios)); + + if (io_kind == "input") { + for (size_t i = 0; i < ios.ArraySize(); i++) { + triton::common::TritonJson::Value io; + RETURN_IF_ERROR(ios.IndexAsObject(i, &io)); + + // Validate name + std::string io_name; + RETURN_IF_ERROR(io.MemberAsString("name", &io_name)); + auto itr = allowed_ios.find(io_name); + if (itr == allowed_ios.end()) { + *naming_convention = NamingConvention::NAMED_INDEX; + break; + } + } + } + + // If not, check if inputs follow INDEX + if (*naming_convention == NamingConvention::NAMED_INDEX) { + for (size_t i = 0; i < ios.ArraySize(); i++) { + triton::common::TritonJson::Value io; + RETURN_IF_ERROR(ios.IndexAsObject(i, &io)); + + // Validate name + std::string io_name; + RETURN_IF_ERROR(io.MemberAsString("name", &io_name)); + int start_pos = io_name.find(deliminator); + if (start_pos == -1) { + *naming_convention = NamingConvention::STRICT_CONFIG_ORDERING; + break; + } else { + // check if the index part of the name is not an integer + std::string index_str = io_name.substr(start_pos + 2); + bool is_int = true; + for (auto itr = index_str.begin(); itr != index_str.end(); itr++) { + if (std::isdigit(*itr) == 0) { + is_int = false; + } + } + + if (!is_int) { + if (io_kind == "input") { + LOG_MESSAGE( + TRITONSERVER_LOG_WARN, + ("input '" + io_name + + "' or previous input(s) are neither an input argument to the " + "model '" + + model_state_->Name() + + "' nor do they follow the __ naming convention. " + "Falling back to enforcing strict ordering from model " + "configuration.") + .c_str()); + } else { + LOG_MESSAGE( + TRITONSERVER_LOG_WARN, + ("output '" + io_name + + "' or previous output(s) of the model '" + + model_state_->Name() + + "' do not follow the __ naming convention. 
" + "Falling back to enforcing strict ordering from model " + "configuration.") + .c_str()); + } + *naming_convention = NamingConvention::STRICT_CONFIG_ORDERING; + break; + } + } + } + } + + return nullptr; // success +} + // This function will return a tensor's contents as a contiguous // chunk in system memory. In some cases this will require copying the data. // If that happens, 'contiguous_buffer' will be set to hold the contiguous From df900ce3d3c03e7f34ce1e9518c09cc2bad520a5 Mon Sep 17 00:00:00 2001 From: Tanmay Verma Date: Tue, 24 May 2022 13:42:51 -0700 Subject: [PATCH 05/76] Add nvtx markers which can be useful in perf profiling (#64) --- CMakeLists.txt | 5 +++++ src/libtorch.cc | 8 +++++++- 2 files changed, 12 insertions(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 909148f..a5ea654 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -44,6 +44,7 @@ project(tritonpytorchbackend LANGUAGES C CXX) option(TRITON_ENABLE_GPU "Enable GPU support in backend" ON) option(TRITON_ENABLE_STATS "Include statistics collections in backend" ON) +option(TRITON_ENABLE_NVTX "Include nvtx markers collection in backend." OFF) option(TRITON_PYTORCH_ENABLE_TORCHTRT "Enable TorchTRT support" OFF) option(TRITON_PYTORCH_ENABLE_TORCHVISION "Enable Torchvision support" ON) @@ -120,6 +121,10 @@ else() endif() endif() # TRITON_ENABLE_GPU +if(${TRITON_ENABLE_NVTX}) + add_definitions(-DTRITON_ENABLE_NVTX=1) +endif() # TRITON_ENABLE_NVTX + # # Shared library implementing the Triton Backend API # diff --git a/src/libtorch.cc b/src/libtorch.cc index e39a2ec..a142f72 100644 --- a/src/libtorch.cc +++ b/src/libtorch.cc @@ -33,6 +33,7 @@ #include "triton/backend/backend_model.h" #include "triton/backend/backend_model_instance.h" #include "triton/backend/backend_output_responder.h" +#include "triton/common/nvtx.h" #include "triton/core/tritonbackend.h" #ifdef TRITON_PYTORCH_ENABLE_TORCHVISION @@ -307,7 +308,6 @@ ModelState::ParseParameters() TRITONSERVER_ErrorDelete(err); } } - LOG_MESSAGE( TRITONSERVER_LOG_INFO, (std::string("Inference Mode is ") + @@ -926,6 +926,8 @@ ModelInstanceState::ProcessRequests( std::to_string(request_count) + " requests") .c_str()); + NVTX_RANGE(nvtx_, "ProcessRequests " + Name()); + uint64_t exec_start_ns = 0; SET_TIMESTAMP(exec_start_ns); @@ -1188,6 +1190,8 @@ ModelInstanceState::Execute( std::vector* input_tensors, std::vector* output_tensors) { + NVTX_RANGE(nvtx_, "Execute " + Name()); + torch::jit::IValue model_outputs_; try { @@ -1758,6 +1762,8 @@ ModelInstanceState::ReadOutputTensors( TRITONBACKEND_Request** requests, const uint32_t request_count, std::vector* responses, uint64_t* compute_end_ns) { + NVTX_RANGE(nvtx_, "ReadOutputTensors " + Name()); + BackendOutputResponder responder( requests, request_count, responses, model_state_->TritonMemoryManager(), model_state_->MaxBatchSize() > 0, model_state_->EnablePinnedInput(), From b8baa9310e06cb486dfb962d1f7ac3a8314a5ea2 Mon Sep 17 00:00:00 2001 From: Hemant Jain Date: Wed, 22 Jun 2022 16:06:32 -0700 Subject: [PATCH 06/76] Add jpeg CV deps to PyTorch (#65) - Needed by Torchvision --- CMakeLists.txt | 3 +++ 1 file changed, 3 insertions(+) diff --git a/CMakeLists.txt b/CMakeLists.txt index a5ea654..41aa7de 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -182,6 +182,7 @@ set(OPENCV_LIBS "libopencv_imgproc.so" "libopencv_core.so" "libpng16.so" + "libjpeg.so" ) if (${TRITON_PYTORCH_DOCKER_BUILD}) @@ -220,6 +221,7 @@ if (${TRITON_PYTORCH_DOCKER_BUILD}) COMMAND docker cp 
pytorch_backend_ptlib:/usr/lib/${LIBS_ARCH}-linux-gnu/libopencv_imgproc.so.3.4.11 libopencv_imgproc.so COMMAND docker cp pytorch_backend_ptlib:/usr/lib/${LIBS_ARCH}-linux-gnu/libopencv_core.so.3.4.11 libopencv_core.so COMMAND docker cp pytorch_backend_ptlib:/usr/lib/${LIBS_ARCH}-linux-gnu/libpng16.so.16.37.0 libpng16.so + COMMAND docker cp pytorch_backend_ptlib:/usr/lib/${LIBS_ARCH}-linux-gnu/libjpeg.so.8.2.2 libjpeg.so COMMAND /bin/sh -c "if [ -f libmkl_def.so ]; then patchelf --add-needed libmkl_gnu_thread.so libmkl_def.so; fi" COMMAND /bin/sh -c "if [ -f libmkl_def.so ]; then patchelf --add-needed libmkl_core.so libmkl_def.so; fi" COMMAND /bin/sh -c "if [ -f libmkl_avx2.so ]; then patchelf --add-needed libmkl_gnu_thread.so libmkl_avx2.so; fi" @@ -439,6 +441,7 @@ if (${TRITON_PYTORCH_DOCKER_BUILD}) COMMAND ln -sf libopencv_imgproc.so libopencv_imgproc.so.${OPENCV_VERSION} COMMAND ln -sf libopencv_core.so libopencv_core.so.${OPENCV_VERSION} COMMAND ln -sf libpng16.so libpng16.so.16 + COMMAND ln -sf libjpeg.so libjpeg.so.8 RESULT_VARIABLE LINK_STATUS WORKING_DIRECTORY ${CMAKE_INSTALL_PREFIX}/backends/pytorch) if(LINK_STATUS AND NOT LINK_STATUS EQUAL 0) From f95175762771cb339a09ba1ff61b214f68973274 Mon Sep 17 00:00:00 2001 From: Hemant Jain Date: Thu, 23 Jun 2022 15:37:28 -0700 Subject: [PATCH 07/76] Fix intel mkl issue that causes segfault (#66) * Fix intel mkl issue that causes segfault - during destruction - causes freeze during inference * Add comment --- CMakeLists.txt | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/CMakeLists.txt b/CMakeLists.txt index 41aa7de..e9fe9ce 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -185,6 +185,9 @@ set(OPENCV_LIBS "libjpeg.so" ) +# The patchelf commands ensure the MKL libraries are loaded correctly during runtime +# Without these, the framework/backend complains of missing libraries / symbols and +# in some cases leads to segmentation faults. 
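+# ('patchelf --add-needed <dep> <lib>' records <dep> as a DT_NEEDED entry of <lib>, so the
+# dynamic linker loads it automatically whenever <lib> is loaded.)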
if (${TRITON_PYTORCH_DOCKER_BUILD}) string(REPLACE ";" " " CONDA_LIBS_STR "${CONDA_LIBS}") @@ -228,6 +231,10 @@ if (${TRITON_PYTORCH_DOCKER_BUILD}) COMMAND /bin/sh -c "if [ -f libmkl_avx2.so ]; then patchelf --add-needed libmkl_core.so libmkl_avx2.so; fi" COMMAND /bin/sh -c "if [ -f libmkl_avx512.so ]; then patchelf --add-needed libmkl_gnu_thread.so libmkl_avx512.so; fi" COMMAND /bin/sh -c "if [ -f libmkl_avx512.so ]; then patchelf --add-needed libmkl_core.so libmkl_avx512.so; fi" + COMMAND /bin/sh -c "if [ -f libmkl_vml_def.so ]; then patchelf --add-needed libmkl_gnu_thread.so libmkl_vml_def.so; fi" + COMMAND /bin/sh -c "if [ -f libmkl_vml_def.so ]; then patchelf --add-needed libmkl_intel_thread.so libmkl_vml_def.so; fi" + COMMAND /bin/sh -c "if [ -f libmkl_vml_def.so ]; then patchelf --add-needed libmkl_core.so libmkl_vml_def.so; fi" + COMMAND /bin/sh -c "patchelf --add-needed libmkl_intel_lp64.so libmkl_intel_thread.so" COMMAND docker rm pytorch_backend_ptlib COMMENT "Extracting pytorch and torchvision libraries and includes from ${TRITON_PYTORCH_DOCKER_IMAGE}" VERBATIM From 858be505819611ac5bd6367491c3c260e80ef416 Mon Sep 17 00:00:00 2001 From: Misha Chornyi <99709299+mc-nv@users.noreply.github.com> Date: Fri, 24 Jun 2022 08:57:10 -0700 Subject: [PATCH 08/76] Adding verification for link creation (#67) --- CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index e9fe9ce..b5c4754 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -234,7 +234,7 @@ if (${TRITON_PYTORCH_DOCKER_BUILD}) COMMAND /bin/sh -c "if [ -f libmkl_vml_def.so ]; then patchelf --add-needed libmkl_gnu_thread.so libmkl_vml_def.so; fi" COMMAND /bin/sh -c "if [ -f libmkl_vml_def.so ]; then patchelf --add-needed libmkl_intel_thread.so libmkl_vml_def.so; fi" COMMAND /bin/sh -c "if [ -f libmkl_vml_def.so ]; then patchelf --add-needed libmkl_core.so libmkl_vml_def.so; fi" - COMMAND /bin/sh -c "patchelf --add-needed libmkl_intel_lp64.so libmkl_intel_thread.so" + COMMAND /bin/sh -c "if [ -f libmkl_intel_thread.so ]; then patchelf --add-needed libmkl_intel_lp64.so libmkl_intel_thread.so; fi" COMMAND docker rm pytorch_backend_ptlib COMMENT "Extracting pytorch and torchvision libraries and includes from ${TRITON_PYTORCH_DOCKER_IMAGE}" VERBATIM From 663ee99e1b374c0722e71175d28e05e968f49630 Mon Sep 17 00:00:00 2001 From: "Jeffery (Zeyu) Zhao" Date: Wed, 6 Jul 2022 08:00:59 +0800 Subject: [PATCH 09/76] Add CUDA cache cleaning flag to pytorch backend (#61) * Add CUDA cache cleaning flag to pytorch backend * minor fixes minor code formatting change per review * Add more notes to README --- README.md | 17 +++++++++++++++++ src/libtorch.cc | 42 +++++++++++++++++++++++++++++++++++++++--- 2 files changed, 56 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 4b41c1a..d5050f6 100644 --- a/README.md +++ b/README.md @@ -179,6 +179,23 @@ key: "ENABLE_WEIGHT_SHARING" } ``` +* `ENABLE_CACHE_CLEANING`: Boolean flag to enable CUDA cache cleaning after each model execution. +If not specified, cache cleaning is disabled. This flag has no effect if model is on CPU. +Setting this flag to true will negatively impact the performance due to additional CUDA cache +cleaning operation after each model execution. Therefore, you should only use this flag if you +serve multiple models with Triton and encounter CUDA out of memory issue during model executions. 
+ +The section of model config file specifying this parameter will look like: + +``` +parameters: { +key: "ENABLE_CACHE_CLEANING" + value: { + string_value:"true" + } +} +``` + * Additional Optimizations: Three additional boolean parameters are available to disable certain Torch optimizations that can sometimes cause latency regressions in models with complex execution modes and dynamic shapes. If not specified, all are enabled by default. diff --git a/src/libtorch.cc b/src/libtorch.cc index a142f72..5449064 100644 --- a/src/libtorch.cc +++ b/src/libtorch.cc @@ -98,6 +98,7 @@ class ModelState : public BackendModel { { return enable_nvfuser_pair_; } + bool EnabledCacheCleaning(){ return enable_cache_cleaning_; } bool EnabledWeightSharing() { return enable_weight_sharing_; } @@ -114,6 +115,9 @@ class ModelState : public BackendModel { // Flag to indicate whether inference mode is enabled. Defaults to false. bool enable_inference_mode_; + // Flag to indicate whether cache clearning after each run is enabled. Defaults to false. + bool enable_cache_cleaning_; + // Flag to indicate whether weight sharing is enabled. Defaults to false. bool enable_weight_sharing_; @@ -173,7 +177,8 @@ ModelState::Create(TRITONBACKEND_Model* triton_model, ModelState** state) ModelState::ModelState(TRITONBACKEND_Model* triton_model) : BackendModel(triton_model), enable_optimized_execution_(true), - enable_inference_mode_(false), enable_weight_sharing_(false), + enable_inference_mode_(false), enable_cache_cleaning_(false), + enable_weight_sharing_(false), enable_tensor_fuser_pair_({false, true}), enable_jit_profiling_pair_({false, true}), enable_jit_executor_pair_({false, true}), @@ -298,6 +303,25 @@ ModelState::ParseParameters() " for model instance '" + Name() + "'") .c_str()); + // If 'ENABLE_CACHE_CLEANING' is not present in 'parameters' then + // no update is made to 'enable_cache_cleaning_'. + err = ParseParameter( + params, "ENABLE_CACHE_CLEANING", &enable_cache_cleaning_); + if (err != nullptr) { + if (TRITONSERVER_ErrorCode(err) != TRITONSERVER_ERROR_NOT_FOUND) { + return err; + } else { + TRITONSERVER_ErrorDelete(err); + } + } + + LOG_MESSAGE( + TRITONSERVER_LOG_INFO, + (std::string("Cache Cleaning is ") + + (enable_cache_cleaning_ ? "enabled" : "disabled") + + " for model instance '" + Name() + "'") + .c_str()); + // If 'INFERENCE_MODE' is not present in 'parameters' then no update is made // to 'enable_inference_mode_'. err = ParseParameter(params, "INFERENCE_MODE", &enable_inference_mode_); @@ -453,6 +477,9 @@ class ModelInstanceState : public BackendModelInstance { void ProcessRequests( TRITONBACKEND_Request** requests, const uint32_t request_count); + // Clear CUDA cache + void ClearCache(); + private: ModelInstanceState( ModelState* model_state, @@ -585,9 +612,8 @@ ModelInstanceState::ModelInstanceState( THROW_IF_BACKEND_INSTANCE_ERROR(ValidateOutputs()); } -ModelInstanceState::~ModelInstanceState() +void ModelInstanceState::ClearCache() { - torch_model_.reset(); #ifdef TRITON_ENABLE_GPU if (device_.is_cuda()) { c10::cuda::CUDACachingAllocator::emptyCache(); @@ -595,6 +621,12 @@ ModelInstanceState::~ModelInstanceState() #endif // TRITON_ENABLE_GPU } +ModelInstanceState::~ModelInstanceState() +{ + torch_model_.reset(); + ClearCache(); +} + TRITONSERVER_Error* ModelInstanceState::ValidateBooleanSequenceControl( triton::common::TritonJson::Value& sequence_batching, @@ -2081,6 +2113,10 @@ TRITONBACKEND_ModelInstanceExecute( // specific request. 
instance_state->ProcessRequests(requests, request_count); + if(model_state->EnabledCacheCleaning()) { + instance_state->ClearCache(); + } + return nullptr; // success } From 3ecda562665cfb29a3584fdce80610450a98d821 Mon Sep 17 00:00:00 2001 From: Kris Hung Date: Thu, 7 Jul 2022 17:25:51 -0700 Subject: [PATCH 10/76] Add reshape+batching and dynamic batching support for string I/O (#69) * reshape+batching and dynamic batching support for string I/O * Address comment * Address comment --- README.md | 2 -- src/libtorch.cc | 84 +++++++++++++++++++------------------------------ 2 files changed, 33 insertions(+), 53 deletions(-) diff --git a/README.md b/README.md index d5050f6..da9d391 100644 --- a/README.md +++ b/README.md @@ -231,5 +231,3 @@ a List of Strings as input(s) / produces a List of String as output(s). For thes Triton allows users to pass String input(s)/recieve String output(s) using the String datatype. As a limitation of using List instead of Tensor for String I/O, only for 1-dimensional input(s)/output(s) are supported for I/O of String type. -Batching is not allowed for PyTorch models with String I/O. For these models, -the user must specify `max_batch_size: 0` in the configuration. diff --git a/src/libtorch.cc b/src/libtorch.cc index 5449064..336629c 100644 --- a/src/libtorch.cc +++ b/src/libtorch.cc @@ -535,6 +535,9 @@ class ModelInstanceState : public BackendModelInstance { // If the input to the tensor is a dictionary of tensors. bool is_dict_input_; + + // If the model supports batching. + bool supports_batching_; }; TRITONSERVER_Error* @@ -607,6 +610,7 @@ ModelInstanceState::ModelInstanceState( expected_input_cnt += 1; } } + supports_batching_ = model_state_->MaxBatchSize() > 0; THROW_IF_BACKEND_INSTANCE_ERROR(ValidateInputs(expected_input_cnt)); THROW_IF_BACKEND_INSTANCE_ERROR(ValidateOutputs()); @@ -787,7 +791,6 @@ ModelInstanceState::ValidateInputs(const size_t expected_input_cnt) "specified."); } - bool supports_batching = model_state_->MaxBatchSize() > 0; NamingConvention naming_convention; RETURN_IF_ERROR(GetNamingConvention(&naming_convention, allowed_inputs)); @@ -837,8 +840,7 @@ ModelInstanceState::ValidateInputs(const size_t expected_input_cnt) .c_str()); } - // Validate shape for String inputs. Only allow 1 dimension and no - // batching. + // Validate shape for String inputs. Only allow 1 dimension. if (io_dtype == "TYPE_STRING") { // If a reshape is provided for the input then use that when // validating the model shapes. @@ -850,7 +852,7 @@ ModelInstanceState::ValidateInputs(const size_t expected_input_cnt) RETURN_IF_ERROR(ParseShape(io, "dims", &dims)); } - if ((dims.size() > 1) || supports_batching) { + if ((dims.size() + (supports_batching_ ? 1 : 0)) > 1) { return TRITONSERVER_ErrorNew( TRITONSERVER_ERROR_INTERNAL, ("Triton only supports 1 dimensional List of String as input for " @@ -880,7 +882,6 @@ ModelInstanceState::ValidateOutputs() "specified."); } - const bool supports_batching = model_state_->MaxBatchSize() > 0; NamingConvention naming_convention; RETURN_IF_ERROR(GetNamingConvention(&naming_convention, {})); @@ -917,8 +918,7 @@ ModelInstanceState::ValidateOutputs() .c_str()); } - // Validate shape for String outputs. Only allow 1 dimension and no - // batching. + // Validate shape for String outputs. Only allow 1 dimension. if (io_dtype == "TYPE_STRING") { // If a reshape is provided for the output then use that when // validating the model shapes. 
@@ -930,7 +930,7 @@ ModelInstanceState::ValidateOutputs() RETURN_IF_ERROR(ParseShape(io, "dims", &dims)); } - if ((dims.size() > 1) || supports_batching) { + if ((dims.size() + (supports_batching_ ? 1 : 0)) > 1) { return TRITONSERVER_ErrorNew( TRITONSERVER_ERROR_INTERNAL, ("Triton only supports 1 dimensional List of String as output for " @@ -1015,7 +1015,7 @@ ModelInstanceState::ProcessRequests( for (size_t i = 0; i < request_count; i++) { if (max_batch_size > 0) { // Retrieve the batch size from one of the inputs, if the model - // supports batching, the first dimension size is batch size + // supports batching, the first dimension size is batch size. TRITONBACKEND_Input* input; TRITONSERVER_Error* err = TRITONBACKEND_RequestInputByIndex(requests[i], 0 /* index */, &input); @@ -1294,7 +1294,7 @@ ModelInstanceState::Execute( if (list_output.elementType()->kind() != c10::TypeKind::StringType) { throw std::invalid_argument( "output at index " + std::to_string(op_index) + - " must be of type Tensor or List[str], recieved List[" + + " must be of type Tensor or List[str], received List[" + list_output.elementType()->str() + "]"); } output_tensors->push_back(m_op); @@ -1310,7 +1310,7 @@ ModelInstanceState::Execute( auto list_output = model_outputs_.toList(); if (list_output.elementType()->kind() != c10::TypeKind::StringType) { throw std::invalid_argument( - "output must be of type Tensor or List[str], recieved List[" + + "output must be of type Tensor or List[str], received List[" + list_output.elementType()->str() + "]"); } output_tensors->push_back(model_outputs_); @@ -1505,8 +1505,7 @@ GetContiguousInputContent( } void -FillStringTensor( - torch::List* input_list, const size_t idx, const size_t cnt) +FillStringTensor(torch::List* input_list, const size_t cnt) { for (size_t c = 0; c < cnt; ++c) { input_list->push_back(""); @@ -1517,9 +1516,8 @@ bool SetStringInputTensor( torch::List* input_list, TRITONBACKEND_Input* input, const char* name, const uint32_t buffer_count, - const size_t request_element_cnt, const size_t tensor_offset, - TRITONBACKEND_Response** response, cudaStream_t stream, - const char* host_policy_name) + const size_t request_element_cnt, TRITONBACKEND_Response** response, + cudaStream_t stream, const char* host_policy_name) { bool cuda_copy = false; size_t element_idx = 0; @@ -1537,9 +1535,7 @@ SetStringInputTensor( stream, &cuda_copy); if (err != nullptr) { RESPOND_AND_SET_NULL_IF_ERROR(response, err); - FillStringTensor( - input_list, tensor_offset + element_idx, - request_element_cnt - element_idx); + FillStringTensor(input_list, request_element_cnt - element_idx); return cuda_copy; } @@ -1564,9 +1560,6 @@ SetStringInputTensor( std::to_string(element_idx + 1) + " for inference input '" + name + "', expecting " + std::to_string(request_element_cnt)) .c_str())); - FillStringTensor( - input_list, tensor_offset + element_idx, - request_element_cnt - element_idx); return cuda_copy; } @@ -1585,9 +1578,7 @@ SetStringInputTensor( std::to_string(len) + " but only " + std::to_string(content_byte_size) + " bytes available") .c_str())); - FillStringTensor( - input_list, tensor_offset + element_idx, - request_element_cnt - element_idx); + FillStringTensor(input_list, request_element_cnt - element_idx); return cuda_copy; } @@ -1608,9 +1599,9 @@ SetStringInputTensor( " strings for inference input '" + name + "', got " + std::to_string(element_idx)) .c_str())); - FillStringTensor( - input_list, tensor_offset + element_idx, - request_element_cnt - element_idx); + if (element_idx < 
request_element_cnt) { + FillStringTensor(input_list, request_element_cnt - element_idx); + } } return cuda_copy; @@ -1620,7 +1611,7 @@ bool SetStringOutputBuffer( torch::List* tensor, TRITONBACKEND_Response** response, TRITONBACKEND_Output* response_output, const size_t tensor_element_count, - const size_t tensor_offset, cudaStream_t stream, std::string* serialized) + cudaStream_t stream, std::string* serialized) { bool cuda_copy = false; @@ -1677,8 +1668,6 @@ ModelInstanceState::SetInputTensors( std::vector* input_tensors, std::vector* input_memories, bool* cuda_copy) { - const int max_batch_size = model_state_->MaxBatchSize(); - // InferenceMode should be used to guard all tensors operations torch::InferenceMode infer_guard(model_state_->EnabledInferenceMode()); @@ -1705,7 +1694,7 @@ ModelInstanceState::SetInputTensors( // The shape for the entire input patch, [total_batch_size, ...] std::vector batchn_shape( input_shape, input_shape + input_dims_count); - if (max_batch_size != 0) { + if (supports_batching_) { batchn_shape[0] = total_batch_size; } @@ -1735,20 +1724,10 @@ ModelInstanceState::SetInputTensors( if (input_datatype == TRITONSERVER_TYPE_BYTES) { - if (batchn_shape.size() != 1) { - return TRITONSERVER_ErrorNew( - TRITONSERVER_ERROR_INTERNAL, ("Triton only supports 1 dimensional " - "List of string as input for '" + - std::string(input_name) + "'") - .c_str()); - } - // Create the PyTorch list to hold the strings. torch::List input_list; input_list.reserve(batchn_shape[0]); - size_t tensor_offset = 0; - for (size_t idx = 0; idx < request_count; idx++) { TRITONBACKEND_Input* input; RESPOND_AND_SET_NULL_IF_ERROR( @@ -1767,9 +1746,7 @@ ModelInstanceState::SetInputTensors( *cuda_copy |= SetStringInputTensor( &input_list, input, input_name, buffer_count, batch_element_cnt, - tensor_offset, &((*responses)[idx]), CudaStream(), - HostPolicyName().c_str()); - tensor_offset += batch_element_cnt; + &((*responses)[idx]), CudaStream(), HostPolicyName().c_str()); } (*input_tensors)[input_index_map_[input_name]] = input_list; @@ -1864,18 +1841,25 @@ ModelInstanceState::ReadOutputTensors( } else if (output_tensors[op_index].isList()) { // Custom handling for string/bytes tensor... - torch::List output_list = output_tensors[op_index].toList(); // Get output shape std::vector batchn_shape{(int64_t)output_list.size()}; - size_t tensor_offset = 0; - for (size_t idx = 0; idx < responses->size(); idx++) { + auto& request = requests[idx]; auto& response = (*responses)[idx]; + if (supports_batching_ != 0) { + TRITONBACKEND_Input* input; + TRITONBACKEND_RequestInputByIndex(request, 0 /* index*/, &input); + const int64_t* shape; + TRITONBACKEND_InputProperties( + input, nullptr, nullptr, &shape, nullptr, nullptr, nullptr); + batchn_shape[0] = shape[0]; + } + const size_t tensor_element_cnt = GetElementCount(batchn_shape); // Only need an response tensor for requested outputs. 
@@ -1889,10 +1873,8 @@ ModelInstanceState::ReadOutputTensors( string_buffer.emplace_back(new std::string()); cuda_copy |= SetStringOutputBuffer( &output_list, &response, response_output, tensor_element_cnt, - tensor_offset, CudaStream(), string_buffer.back().get()); + CudaStream(), string_buffer.back().get()); } - - tensor_offset += tensor_element_cnt; } } else { return TRITONSERVER_ErrorNew( From 1f89243397783234d94c0aa206be2e4953d217b7 Mon Sep 17 00:00:00 2001 From: Tanmay Verma Date: Fri, 8 Jul 2022 10:14:10 -0700 Subject: [PATCH 11/76] Use the SetModelConfig backend utility (#71) --- src/libtorch.cc | 10 +--------- 1 file changed, 1 insertion(+), 9 deletions(-) diff --git a/src/libtorch.cc b/src/libtorch.cc index 336629c..c1d403e 100644 --- a/src/libtorch.cc +++ b/src/libtorch.cc @@ -159,15 +159,7 @@ ModelState::Create(TRITONBACKEND_Model* triton_model, ModelState** state) triton_model, &auto_complete_config)); if (auto_complete_config) { RETURN_IF_ERROR((*state)->AutoCompleteConfig()); - - triton::common::TritonJson::WriteBuffer json_buffer; - (*state)->ModelConfig().Write(&json_buffer); - - TRITONSERVER_Message* message; - RETURN_IF_ERROR(TRITONSERVER_MessageNewFromSerializedJson( - &message, json_buffer.Base(), json_buffer.Size())); - RETURN_IF_ERROR(TRITONBACKEND_ModelSetConfig( - triton_model, 1 /* config_version */, message)); + RETURN_IF_ERROR((*state)->SetModelConfig()); } RETURN_IF_ERROR((*state)->ParseParameters()); From 3421d0b04531866e4fa59e0726b7d2fc60274834 Mon Sep 17 00:00:00 2001 From: Iman Tabrizian Date: Sat, 16 Jul 2022 23:20:20 -0400 Subject: [PATCH 12/76] Fix pytorch forward argument naming convention (#72) --- src/libtorch.cc | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/src/libtorch.cc b/src/libtorch.cc index c1d403e..a7b312a 100644 --- a/src/libtorch.cc +++ b/src/libtorch.cc @@ -506,7 +506,7 @@ class ModelInstanceState : public BackendModelInstance { // Get the naming convention for inputs/outputs from the model configuration TRITONSERVER_Error* GetNamingConvention( NamingConvention* naming_convention, - const std::set& allowed_io); + const std::vector& allowed_io); ModelState* model_state_; @@ -713,7 +713,7 @@ ModelInstanceState::ValidateInputs(const size_t expected_input_cnt) { // Collect all the expected input tensor names and validate that the model // configuration specifies only those. 
- std::set allowed_inputs; + std::vector allowed_inputs; const torch::jit::Method& method = torch_model_->get_method("forward"); const auto& schema = method.function().getSchema(); @@ -755,7 +755,7 @@ ModelInstanceState::ValidateInputs(const size_t expected_input_cnt) "Dict(str, Tensor) or input(s) of type Tensor are supported.") .c_str()); } - allowed_inputs.emplace(arguments.at(i).name()); + allowed_inputs.emplace_back(arguments.at(i).name()); } // If all inputs are tensors, match number of expected inputs between model @@ -800,7 +800,7 @@ ModelInstanceState::ValidateInputs(const size_t expected_input_cnt) } else { switch (naming_convention) { case NamingConvention::FORWARD_ARGUMENT: { - auto itr = allowed_inputs.find(io_name); + auto itr = std::find(allowed_inputs.begin(), allowed_inputs.end(), io_name); if (itr != allowed_inputs.end()) { input_index_map_[io_name] = std::distance(allowed_inputs.begin(), itr); @@ -1325,7 +1325,7 @@ ModelInstanceState::Execute( TRITONSERVER_Error* ModelInstanceState::GetNamingConvention( NamingConvention* naming_convention, - const std::set& allowed_ios) + const std::vector& allowed_ios) { // Rules for (non-Dictionary) input tensor names: // 1. Must be in 'allowed_inputs' (arguments in the forward function) @@ -1358,7 +1358,7 @@ ModelInstanceState::GetNamingConvention( // Validate name std::string io_name; RETURN_IF_ERROR(io.MemberAsString("name", &io_name)); - auto itr = allowed_ios.find(io_name); + auto itr = std::find(allowed_ios.begin(), allowed_ios.end(), io_name); if (itr == allowed_ios.end()) { *naming_convention = NamingConvention::NAMED_INDEX; break; From 8ae6cd42cf21cc6f232c341289fcd9bcf3818c16 Mon Sep 17 00:00:00 2001 From: hemantj Date: Wed, 6 Jul 2022 09:55:52 -0700 Subject: [PATCH 13/76] Fix path for new directory structure --- CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index b5c4754..5c1a9bd 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -215,7 +215,7 @@ if (${TRITON_PYTORCH_DOCKER_BUILD}) COMMAND docker cp pytorch_backend_ptlib:/opt/conda/lib/python3.8/site-packages/torch_tensorrt/bin/torchtrtc torchtrtc || echo "error ignored..." || true COMMAND docker cp pytorch_backend_ptlib:/opt/pytorch/pytorch/LICENSE LICENSE.pytorch COMMAND docker cp pytorch_backend_ptlib:/opt/conda/lib/python3.8/site-packages/torch/include include/torch - COMMAND docker cp pytorch_backend_ptlib:/opt/pytorch/pytorch/torch/csrc/jit/codegen include/torch/torch/csrc/jit/codegen + COMMAND docker cp pytorch_backend_ptlib:/opt/pytorch/pytorch/torch/csrc/jit/codegen include/torch/torch/csrc/jit/. 
COMMAND docker cp pytorch_backend_ptlib:/opt/pytorch/vision/torchvision/csrc include/torchvision/torchvision COMMAND docker cp pytorch_backend_ptlib:/usr/lib/${LIBS_ARCH}-linux-gnu/libopencv_videoio.so.3.4.11 libopencv_videoio.so COMMAND docker cp pytorch_backend_ptlib:/usr/lib/${LIBS_ARCH}-linux-gnu/libopencv_highgui.so.3.4.11 libopencv_highgui.so From 8c1c6bd741814bd7f7074801d7d01b09dd0f5a8a Mon Sep 17 00:00:00 2001 From: Iman Tabrizian Date: Mon, 8 Aug 2022 15:00:37 -0400 Subject: [PATCH 14/76] Fix multiple instance performance (#73) * Fix multiple instance performance * Fix cuda stream destruction * Fix CPU version * Refactor event capturing * Review edit * fix up --- src/libtorch.cc | 230 ++++++++++++++++++++++++++++-------------- src/libtorch_utils.cc | 14 +++ src/libtorch_utils.h | 11 +- 3 files changed, 174 insertions(+), 81 deletions(-) diff --git a/src/libtorch.cc b/src/libtorch.cc index a7b312a..76567dc 100644 --- a/src/libtorch.cc +++ b/src/libtorch.cc @@ -49,6 +49,7 @@ #ifdef TRITON_ENABLE_GPU #include +#include #include #endif // TRITON_ENABLE_GPU @@ -98,9 +99,10 @@ class ModelState : public BackendModel { { return enable_nvfuser_pair_; } - bool EnabledCacheCleaning(){ return enable_cache_cleaning_; } + bool EnabledCacheCleaning() { return enable_cache_cleaning_; } bool EnabledWeightSharing() { return enable_weight_sharing_; } + const std::vector& ModelOutputs() { return output_names_; } private: ModelState(TRITONBACKEND_Model* triton_model); @@ -115,7 +117,8 @@ class ModelState : public BackendModel { // Flag to indicate whether inference mode is enabled. Defaults to false. bool enable_inference_mode_; - // Flag to indicate whether cache clearning after each run is enabled. Defaults to false. + // Flag to indicate whether cache cleaning after each run is enabled. + // Defaults to false. bool enable_cache_cleaning_; // Flag to indicate whether weight sharing is enabled. Defaults to false. @@ -138,6 +141,10 @@ class ModelState : public BackendModel { std::map< std::pair, std::shared_ptr> torch_models_; + + // List of all the outputs specified in the output section of model + // configuration. + std::vector output_names_; }; TRITONSERVER_Error* @@ -170,12 +177,27 @@ ModelState::Create(TRITONBACKEND_Model* triton_model, ModelState** state) ModelState::ModelState(TRITONBACKEND_Model* triton_model) : BackendModel(triton_model), enable_optimized_execution_(true), enable_inference_mode_(false), enable_cache_cleaning_(false), - enable_weight_sharing_(false), - enable_tensor_fuser_pair_({false, true}), + enable_weight_sharing_(false), enable_tensor_fuser_pair_({false, true}), enable_jit_profiling_pair_({false, true}), enable_jit_executor_pair_({false, true}), enable_nvfuser_pair_({false, false}) { + output_names_.clear(); + + triton::common::TritonJson::Value ios; + THROW_IF_BACKEND_INSTANCE_ERROR(ModelConfig().MemberAsArray("output", &ios)); + for (size_t i = 0; i < ios.ArraySize(); i++) { + triton::common::TritonJson::Value io; + THROW_IF_BACKEND_INSTANCE_ERROR(ios.IndexAsObject(i, &io)); + + // Use names from ModelConfig by reference since the model + // config will persist longer than this inference execution. 
+ const char* io_name; + size_t io_name_len; + THROW_IF_BACKEND_INSTANCE_ERROR( + io.MemberAsString("name", &io_name, &io_name_len)); + output_names_.emplace_back(io_name); + } } TRITONSERVER_Error* @@ -497,11 +519,12 @@ class ModelInstanceState : public BackendModelInstance { std::vector* input_tensors, std::vector* input_memories, bool* cuda_copy); TRITONSERVER_Error* ReadOutputTensors( - size_t total_batch_size, const std::vector& output_names, + size_t total_batch_size, const std::vector& output_tensors, TRITONBACKEND_Request** requests, const uint32_t request_count, - std::vector* responses, - uint64_t* compute_end_ns); + std::vector* responses); + TRITONSERVER_Error* RecordBackendTimestamp( + uint64_t* timestamp, void* cuda_event); // Get the naming convention for inputs/outputs from the model configuration TRITONSERVER_Error* GetNamingConvention( @@ -530,6 +553,13 @@ class ModelInstanceState : public BackendModelInstance { // If the model supports batching. bool supports_batching_; + +#ifdef TRITON_ENABLE_GPU + // PyTorch stream used for execution of inferences. + cudaEvent_t compute_input_start_event_; + cudaEvent_t compute_infer_start_event_; + cudaEvent_t compute_output_start_event_; +#endif }; TRITONSERVER_Error* @@ -556,7 +586,23 @@ ModelInstanceState::ModelInstanceState( model_state_(model_state), device_(torch::kCPU), is_dict_input_(false) { if (Kind() == TRITONSERVER_INSTANCEGROUPKIND_GPU) { +#ifdef TRITON_ENABLE_GPU device_ = torch::Device(torch::kCUDA, DeviceId()); + // Need to set the CUDA context so that the context that events are + // created on match with contexts that events are recorded with. + THROW_IF_BACKEND_INSTANCE_ERROR(ConvertCUDAStatusToTritonError( + cudaSetDevice(DeviceId()), TRITONSERVER_ERROR_INTERNAL, + "Failed to set the device")); + THROW_IF_BACKEND_INSTANCE_ERROR(ConvertCUDAStatusToTritonError( + cudaEventCreate(&compute_input_start_event_), + TRITONSERVER_ERROR_INTERNAL, "Failed to create cuda event")); + THROW_IF_BACKEND_INSTANCE_ERROR(ConvertCUDAStatusToTritonError( + cudaEventCreate(&compute_infer_start_event_), + TRITONSERVER_ERROR_INTERNAL, "Failed to create cuda event")); + THROW_IF_BACKEND_INSTANCE_ERROR(ConvertCUDAStatusToTritonError( + cudaEventCreate(&compute_output_start_event_), + TRITONSERVER_ERROR_INTERNAL, "Failed to create cuda event")); +#endif } THROW_IF_BACKEND_INSTANCE_ERROR(model_state->LoadModel( @@ -570,6 +616,7 @@ ModelInstanceState::ModelInstanceState( } } + // If this is a sequence model then make sure that the required // inputs are present in the model and have the correct shape and // datatype. 
@@ -608,7 +655,8 @@ ModelInstanceState::ModelInstanceState( THROW_IF_BACKEND_INSTANCE_ERROR(ValidateOutputs()); } -void ModelInstanceState::ClearCache() +void +ModelInstanceState::ClearCache() { #ifdef TRITON_ENABLE_GPU if (device_.is_cuda()) { @@ -800,7 +848,8 @@ ModelInstanceState::ValidateInputs(const size_t expected_input_cnt) } else { switch (naming_convention) { case NamingConvention::FORWARD_ARGUMENT: { - auto itr = std::find(allowed_inputs.begin(), allowed_inputs.end(), io_name); + auto itr = + std::find(allowed_inputs.begin(), allowed_inputs.end(), io_name); if (itr != allowed_inputs.end()) { input_index_map_[io_name] = std::distance(allowed_inputs.begin(), itr); @@ -950,6 +999,14 @@ ModelInstanceState::ProcessRequests( std::to_string(request_count) + " requests") .c_str()); + if (Kind() == TRITONSERVER_INSTANCEGROUPKIND_GPU) { +#ifdef TRITON_ENABLE_GPU + at::cuda::CUDAStream torch_stream = + at::cuda::getStreamFromExternal(stream_, DeviceId()); + at::cuda::setCurrentCUDAStream(torch_stream); +#endif + } + NVTX_RANGE(nvtx_, "ProcessRequests " + Name()); uint64_t exec_start_ns = 0; @@ -1003,7 +1060,6 @@ ModelInstanceState::ProcessRequests( } } - for (size_t i = 0; i < request_count; i++) { if (max_batch_size > 0) { // Retrieve the batch size from one of the inputs, if the model @@ -1056,6 +1112,15 @@ ModelInstanceState::ProcessRequests( std::vector input_memories; bool cuda_copy = false; std::unique_ptr collector; + if (Kind() == TRITONSERVER_INSTANCEGROUPKIND_GPU) { +#ifdef TRITON_ENABLE_GPU + RESPOND_ALL_AND_SET_TRUE_IF_ERROR( + responses, request_count, all_response_failed, + ConvertCUDAStatusToTritonError( + cudaEventRecord(compute_input_start_event_, stream_), + TRITONSERVER_ERROR_INTERNAL, "Failed to record the event.")); +#endif + } if (!all_response_failed) { collector.reset(new BackendInputCollector( @@ -1070,52 +1135,24 @@ ModelInstanceState::ProcessRequests( &cuda_copy)); } - // Request to retrieve all model outputs. 'output_names' and - // 'output_tensors' are parallel vectors and so must be kept in - // sync. - std::vector output_names; - std::vector output_tensors; - if (!all_response_failed) { - triton::common::TritonJson::Value ios; - TRITONSERVER_Error* err = - model_state_->ModelConfig().MemberAsArray("output", &ios); - if (err == nullptr) { - for (size_t i = 0; i < ios.ArraySize(); i++) { - triton::common::TritonJson::Value io; - err = ios.IndexAsObject(i, &io); - if (err != nullptr) { - break; - } - - // Use names from ModelConfig by reference since the model - // config will persist longer than this inference execution. - const char* io_name; - size_t io_name_len; - err = io.MemberAsString("name", &io_name, &io_name_len); - if (err != nullptr) { - break; - } - - output_names.emplace_back(io_name); - } - } - - if (err != nullptr) { - RESPOND_ALL_AND_SET_TRUE_IF_ERROR( - responses, request_count, all_response_failed, err); - output_names.clear(); - } - } - -// Wait for any in-flight input tensor copies to complete. 
+ // If the instance kind is not GPU, we need to synchronize the CUDA stream + if (Kind() != TRITONSERVER_INSTANCEGROUPKIND_GPU) { #ifdef TRITON_ENABLE_GPU - if (cuda_copy) { - cudaStreamSynchronize(CudaStream()); - } + if (cuda_copy) { + cudaStreamSynchronize(stream_); + cuda_copy = false; + } #endif + } + std::vector output_tensors; uint64_t compute_start_ns = 0; - SET_TIMESTAMP(compute_start_ns); + + RESPOND_ALL_AND_SET_TRUE_IF_ERROR( + responses, request_count, all_response_failed, + RecordBackendTimestamp( + &compute_start_ns, + reinterpret_cast(&compute_infer_start_event_))); // Run... if (!all_response_failed) { @@ -1135,7 +1172,7 @@ ModelInstanceState::ProcessRequests( int max_index = output_tensors.size() - 1; if (!all_response_failed) { - for (const auto& name : output_names) { + for (const auto& name : model_state_->ModelOutputs()) { int op_index = output_index_map_[name]; if ((op_index < 0) || (op_index > max_index)) { RESPOND_ALL_AND_SET_TRUE_IF_ERROR( @@ -1155,14 +1192,19 @@ ModelInstanceState::ProcessRequests( } uint64_t compute_end_ns = 0; + RESPOND_ALL_AND_SET_TRUE_IF_ERROR( + responses, request_count, all_response_failed, + RecordBackendTimestamp( + &compute_end_ns, + reinterpret_cast(&compute_output_start_event_))); if (!all_response_failed) { if (!invalid_index) { RESPOND_ALL_AND_SET_TRUE_IF_ERROR( responses, request_count, all_response_failed, ReadOutputTensors( - total_batch_size, output_names, output_tensors, requests, - request_count, &responses, &compute_end_ns)); + total_batch_size, output_tensors, requests, request_count, + &responses)); } } @@ -1182,6 +1224,33 @@ ModelInstanceState::ProcessRequests( } } + // We don't need an explicit CUDA syncrhonization here since we have already + // synchronized the stream in the ReadOutputTensors function. + if (Kind() == TRITONSERVER_INSTANCEGROUPKIND_GPU) { +#ifdef TRITON_ENABLE_GPU + float compute_input_duration = 0; + float compute_infer_duration = 0; + RESPOND_ALL_AND_SET_TRUE_IF_ERROR( + responses, request_count, all_response_failed, + ConvertCUDAStatusToTritonError( + cudaEventElapsedTime( + &compute_input_duration, compute_input_start_event_, + compute_infer_start_event_), + TRITONSERVER_ERROR_INTERNAL, "Failed to capture elapsed time")); + + RESPOND_ALL_AND_SET_TRUE_IF_ERROR( + responses, request_count, all_response_failed, + ConvertCUDAStatusToTritonError( + cudaEventElapsedTime( + &compute_infer_duration, compute_infer_start_event_, + compute_output_start_event_), + TRITONSERVER_ERROR_INTERNAL, "Failed to capture elapsed time")); + + compute_start_ns = exec_start_ns + (compute_input_duration * 1e6); + compute_end_ns = compute_start_ns + (compute_infer_duration * 1e6); +#endif + } + // Report statistics for each request. for (uint32_t r = 0; r < request_count; ++r) { auto& request = requests[r]; @@ -1714,7 +1783,6 @@ ModelInstanceState::SetInputTensors( ? options.device(torch::kCUDA, device_.index()) : options.device(torch::kCPU); - if (input_datatype == TRITONSERVER_TYPE_BYTES) { // Create the PyTorch list to hold the strings. 
torch::List input_list; @@ -1758,10 +1826,10 @@ ModelInstanceState::SetInputTensors( TRITONSERVER_Error* ModelInstanceState::ReadOutputTensors( - size_t total_batch_size, const std::vector& output_names, + size_t total_batch_size, const std::vector& output_tensors, TRITONBACKEND_Request** requests, const uint32_t request_count, - std::vector* responses, uint64_t* compute_end_ns) + std::vector* responses) { NVTX_RANGE(nvtx_, "ReadOutputTensors " + Name()); @@ -1773,8 +1841,8 @@ ModelInstanceState::ReadOutputTensors( bool cuda_copy = false; // The serialized string buffer must be valid until output copies are done std::vector> string_buffer; - for (size_t idx = 0; idx < output_names.size(); idx++) { - std::string name = output_names[idx]; + for (size_t idx = 0; idx < model_state_->ModelOutputs().size(); idx++) { + std::string name = model_state_->ModelOutputs()[idx]; int op_index = output_index_map_[name]; if (output_tensors[op_index].isTensor()) { @@ -1875,34 +1943,40 @@ ModelInstanceState::ReadOutputTensors( "' must be of type Tensor or List[str].") .c_str()); } - - // PyTorch uses asynchronous execution to run the model. Setting the compute - // end timestamp immediately after Execute() does not capture the complete - // model execution time. When the first output buffer is accessed/copied by - // ProcessTensor(), there is a synchronization that is done to ensure the - // data is correctly copied from the output tensor. To avoid overheads of - // additional synchronization, we continue to use the default cuda stream. - // However the drawback of this is that the compute infer time reported - // would be slightly later than it is in reality and the compute output time - // reported would be smaller than it is in reality. We allow this because - // synchronizing manually negatively impacts performance. - if (idx == 0) { - SET_TIMESTAMP(*compute_end_ns); - } } // Finalize and wait for any pending buffer copies. cuda_copy |= responder.Finalize(); + if (Kind() != TRITONSERVER_INSTANCEGROUPKIND_GPU) { #ifdef TRITON_ENABLE_GPU - if (cuda_copy) { - cudaStreamSynchronize(stream_); + if (cuda_copy) { + cudaStreamSynchronize(stream_); + cuda_copy = false; + } +#endif } -#endif // TRITON_ENABLE_GPU return nullptr; } +TRITONSERVER_Error* +ModelInstanceState::RecordBackendTimestamp( + uint64_t* timestamp, void* cuda_event) +{ + if (Kind() == TRITONSERVER_INSTANCEGROUPKIND_GPU) { +#ifdef TRITON_ENABLE_GPU + cudaEvent_t* lcuda_event = reinterpret_cast(cuda_event); + RETURN_IF_ERROR(ConvertCUDAStatusToTritonError( + cudaEventRecord(*lcuda_event, stream_), TRITONSERVER_ERROR_INTERNAL, + "Failed to record the event.")); +#endif + } else { + SET_TIMESTAMP(*timestamp); + } + return nullptr; +} + ///////////// extern "C" { @@ -2087,7 +2161,7 @@ TRITONBACKEND_ModelInstanceExecute( // specific request. 
instance_state->ProcessRequests(requests, request_count); - if(model_state->EnabledCacheCleaning()) { + if (model_state->EnabledCacheCleaning()) { instance_state->ClearCache(); } diff --git a/src/libtorch_utils.cc b/src/libtorch_utils.cc index a554ba9..699c742 100644 --- a/src/libtorch_utils.cc +++ b/src/libtorch_utils.cc @@ -149,4 +149,18 @@ ParseParameter( return nullptr; } +#ifdef TRITON_ENABLE_GPU +TRITONSERVER_Error* +ConvertCUDAStatusToTritonError( + cudaError_t cuda_error,TRITONSERVER_Error_Code code, const char* msg) +{ + if (cuda_error != cudaSuccess) { + return TRITONSERVER_ErrorNew( + code, + (std::string(msg) + ": " + cudaGetErrorString(cuda_error)).c_str()); + } + return nullptr; // success +} +#endif + }}} // namespace triton::backend::pytorch diff --git a/src/libtorch_utils.h b/src/libtorch_utils.h index e112037..a8f0c0d 100644 --- a/src/libtorch_utils.h +++ b/src/libtorch_utils.h @@ -51,9 +51,14 @@ std::pair ConvertDataTypeToTorchType( std::pair ModelConfigDataTypeToTorchType( const std::string& data_type_str); -// If the key 'mkey' is present in 'params' then update 'value' with the value -// associated with that key. If 'mkey' is not present in 'params' then no update -// is made to 'value'. +#ifdef TRITON_ENABLE_GPU +TRITONSERVER_Error* ConvertCUDAStatusToTritonError( + cudaError_t cuda_error, TRITONSERVER_Error_Code code, const char* msg); +#endif + +// If the key 'mkey' is present in 'params' then update 'value' with the +// value associated with that key. If 'mkey' is not present in 'params' then +// no update is made to 'value'. TRITONSERVER_Error* ParseParameter( triton::common::TritonJson::Value& params, const std::string& mkey, bool* value); From bee8fde75ce8400c26cf7fe84df6f2cb7be34437 Mon Sep 17 00:00:00 2001 From: Iman Tabrizian Date: Tue, 9 Aug 2022 12:48:20 -0400 Subject: [PATCH 15/76] Fix CPU only build (#75) --- src/libtorch.cc | 4 ---- 1 file changed, 4 deletions(-) diff --git a/src/libtorch.cc b/src/libtorch.cc index 76567dc..cc3ab55 100644 --- a/src/libtorch.cc +++ b/src/libtorch.cc @@ -554,12 +554,9 @@ class ModelInstanceState : public BackendModelInstance { // If the model supports batching. bool supports_batching_; -#ifdef TRITON_ENABLE_GPU - // PyTorch stream used for execution of inferences. cudaEvent_t compute_input_start_event_; cudaEvent_t compute_infer_start_event_; cudaEvent_t compute_output_start_event_; -#endif }; TRITONSERVER_Error* @@ -616,7 +613,6 @@ ModelInstanceState::ModelInstanceState( } } - // If this is a sequence model then make sure that the required // inputs are present in the model and have the correct shape and // datatype. 
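
The timing change in the patch above replaces host-side timestamps on the GPU path with CUDA events recorded on the instance's stream, from which `compute_start_ns` and `compute_end_ns` are later reconstructed via `cudaEventElapsedTime`. The standalone sketch below is not taken from the backend source; the variable names, the empty "enqueue work here" placeholders, and the build command are illustrative assumptions. It only shows the same event-record/elapsed-time pattern with plain CUDA runtime calls, assuming a CUDA toolkit is available:

```cpp
// Minimal sketch of the CUDA-event timing pattern adopted above: record an
// event on the stream at the start of each phase, then convert the measured
// elapsed times into nanosecond timestamps relative to a host-side baseline.
// Build (assumption): nvcc -o event_timing event_timing.cu
#include <cuda_runtime.h>
#include <chrono>
#include <cstdint>
#include <cstdio>

int main() {
  cudaSetDevice(0);
  cudaStream_t stream;
  cudaStreamCreate(&stream);

  cudaEvent_t input_start, infer_start, output_start;
  cudaEventCreate(&input_start);
  cudaEventCreate(&infer_start);
  cudaEventCreate(&output_start);

  // Host-side baseline, analogous to 'exec_start_ns'.
  uint64_t exec_start_ns =
      std::chrono::duration_cast<std::chrono::nanoseconds>(
          std::chrono::steady_clock::now().time_since_epoch()).count();

  cudaEventRecord(input_start, stream);
  // ... enqueue input copies on 'stream' here ...
  cudaEventRecord(infer_start, stream);
  // ... enqueue model execution on 'stream' here ...
  cudaEventRecord(output_start, stream);

  // Make sure the recorded events have completed before querying them.
  cudaStreamSynchronize(stream);

  float input_ms = 0.0f, infer_ms = 0.0f;
  cudaEventElapsedTime(&input_ms, input_start, infer_start);
  cudaEventElapsedTime(&infer_ms, infer_start, output_start);

  // Reconstruct nanosecond timestamps from the millisecond durations.
  uint64_t compute_start_ns = exec_start_ns + (uint64_t)(input_ms * 1e6);
  uint64_t compute_end_ns = compute_start_ns + (uint64_t)(infer_ms * 1e6);
  printf("compute window: %llu -> %llu ns\n",
         (unsigned long long)compute_start_ns,
         (unsigned long long)compute_end_ns);

  cudaEventDestroy(input_start);
  cudaEventDestroy(infer_start);
  cudaEventDestroy(output_start);
  cudaStreamDestroy(stream);
  return 0;
}
```

Because the events are recorded on the same stream that carries the copies and the model execution, the measured durations reflect device-side work while only a single synchronization is needed before the elapsed times are read back.
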
From 0220e01259697a691b552fd9a0553b4452281f17 Mon Sep 17 00:00:00 2001 From: Iman Tabrizian Date: Mon, 15 Aug 2022 22:28:37 -0400 Subject: [PATCH 16/76] Fix stream synchronization (#77) --- src/libtorch.cc | 22 +++++++++------------- 1 file changed, 9 insertions(+), 13 deletions(-) diff --git a/src/libtorch.cc b/src/libtorch.cc index cc3ab55..4cb83d2 100644 --- a/src/libtorch.cc +++ b/src/libtorch.cc @@ -1131,15 +1131,12 @@ ModelInstanceState::ProcessRequests( &cuda_copy)); } - // If the instance kind is not GPU, we need to synchronize the CUDA stream - if (Kind() != TRITONSERVER_INSTANCEGROUPKIND_GPU) { #ifdef TRITON_ENABLE_GPU - if (cuda_copy) { - cudaStreamSynchronize(stream_); - cuda_copy = false; - } -#endif + if (cuda_copy) { + cudaStreamSynchronize(stream_); + cuda_copy = false; } +#endif std::vector output_tensors; uint64_t compute_start_ns = 0; @@ -1944,14 +1941,13 @@ ModelInstanceState::ReadOutputTensors( // Finalize and wait for any pending buffer copies. cuda_copy |= responder.Finalize(); - if (Kind() != TRITONSERVER_INSTANCEGROUPKIND_GPU) { #ifdef TRITON_ENABLE_GPU - if (cuda_copy) { - cudaStreamSynchronize(stream_); - cuda_copy = false; - } + // We have to always synchronize the stream. This is to make sure that + // the events on the cuda stream are synchronized. Otherwise, the events + // are only guaranteed to be synchronized if the model provides the output + // on GPU. + cudaStreamSynchronize(stream_); #endif - } return nullptr; } From 5477b119214a066ba171d5b345c2e068998e219d Mon Sep 17 00:00:00 2001 From: GuanLuo <41310872+GuanLuo@users.noreply.github.com> Date: Wed, 17 Aug 2022 14:34:49 -0700 Subject: [PATCH 17/76] Fix possible "double send" of responses. (#79) * [DO NOT MERGE] WAR possible segfault * Add comment --- src/libtorch.cc | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/src/libtorch.cc b/src/libtorch.cc index 4cb83d2..4e327a3 100644 --- a/src/libtorch.cc +++ b/src/libtorch.cc @@ -1221,23 +1221,25 @@ ModelInstanceState::ProcessRequests( // synchronized the stream in the ReadOutputTensors function. 
if (Kind() == TRITONSERVER_INSTANCEGROUPKIND_GPU) { #ifdef TRITON_ENABLE_GPU + // [FIXME] in the case of cudaEventElapsedTime failure, should handle + // stats reporting more gracefully as the durations are inaccurate float compute_input_duration = 0; float compute_infer_duration = 0; - RESPOND_ALL_AND_SET_TRUE_IF_ERROR( - responses, request_count, all_response_failed, + LOG_IF_ERROR( ConvertCUDAStatusToTritonError( cudaEventElapsedTime( &compute_input_duration, compute_input_start_event_, compute_infer_start_event_), - TRITONSERVER_ERROR_INTERNAL, "Failed to capture elapsed time")); + TRITONSERVER_ERROR_INTERNAL, "Failed to capture elapsed time"), + "Failed to capture elapsed time"); - RESPOND_ALL_AND_SET_TRUE_IF_ERROR( - responses, request_count, all_response_failed, + LOG_IF_ERROR( ConvertCUDAStatusToTritonError( cudaEventElapsedTime( &compute_infer_duration, compute_infer_start_event_, compute_output_start_event_), - TRITONSERVER_ERROR_INTERNAL, "Failed to capture elapsed time")); + TRITONSERVER_ERROR_INTERNAL, "Failed to capture elapsed time"), + "Failed to capture elapsed time"); compute_start_ns = exec_start_ns + (compute_input_duration * 1e6); compute_end_ns = compute_start_ns + (compute_infer_duration * 1e6); From 935f4a5afbbece6d79dd9114eff0bf06f2c849f4 Mon Sep 17 00:00:00 2001 From: Iman Tabrizian Date: Thu, 1 Sep 2022 15:01:37 -0400 Subject: [PATCH 18/76] Remove PyTorch multiple instance known issue (#80) --- README.md | 8 -------- 1 file changed, 8 deletions(-) diff --git a/README.md b/README.md index da9d391..616a204 100644 --- a/README.md +++ b/README.md @@ -218,14 +218,6 @@ complex execution modes and dynamic shapes. If not specified, all are enabled by state and a restart of the server may be required to continue serving successfully. -* Multiple instances of the PyTorch model on GPU do not always - increase performance. Due to thread specific caching in PyTorch, using - multiple instances of the model interact negatively. See - [here](https://github.com/pytorch/pytorch/issues/27902) for more details. - Setting the parameter `DISABLE_OPTIMIZED_EXECUTION` to "true" in the model - configuration may help in some cases to avoid these negative interactions - due to model specific caching and increase multiple instance performance. - * PyTorch does not support Tensor of Strings but it does support models that accept a List of Strings as input(s) / produces a List of String as output(s). For these models Triton allows users to pass String input(s)/recieve String output(s) using the String From f85fbab31856a118d1bcc5ae2176dc164cf2872a Mon Sep 17 00:00:00 2001 From: holidaydrien Date: Tue, 25 Oct 2022 20:34:39 -0400 Subject: [PATCH 19/76] fix typo (#81) --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 616a204..e39152a 100644 --- a/README.md +++ b/README.md @@ -220,6 +220,6 @@ complex execution modes and dynamic shapes. If not specified, all are enabled by * PyTorch does not support Tensor of Strings but it does support models that accept a List of Strings as input(s) / produces a List of String as output(s). For these models -Triton allows users to pass String input(s)/recieve String output(s) using the String +Triton allows users to pass String input(s)/receive String output(s) using the String datatype. As a limitation of using List instead of Tensor for String I/O, only for 1-dimensional input(s)/output(s) are supported for I/O of String type. 
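
The string I/O limitation described in the README hunk above can also be seen from the LibTorch side: a 1-dimensional `TYPE_STRING` input is materialized as a list of strings, and the model is expected to return either a `Tensor` or a `List[str]`. The sketch below is standalone and not backend code; the model path `string_model.pt` and the sample strings are placeholders, and it assumes LibTorch headers and libraries are available:

```cpp
// Minimal LibTorch sketch showing how a 1-D list of strings can be passed to
// a TorchScript module and how a Tensor-or-List[str] result can be read back,
// mirroring the string I/O handling described above.
#include <torch/script.h>
#include <iostream>
#include <string>
#include <vector>

int main() {
  // Placeholder path to a scripted model whose forward() takes a List[str].
  torch::jit::script::Module module = torch::jit::load("string_model.pt");

  // Build the flat (1-dimensional) list of strings for one request.
  torch::List<std::string> input_list;
  input_list.reserve(2);
  input_list.push_back("hello");
  input_list.push_back("world");

  std::vector<torch::jit::IValue> inputs;
  inputs.emplace_back(input_list);

  torch::jit::IValue output = module.forward(inputs);
  if (output.isTensor()) {
    std::cout << output.toTensor() << std::endl;
  } else if (output.isList()) {
    // String outputs come back as a List[str].
    auto out_list = output.toList();
    for (size_t i = 0; i < out_list.size(); ++i) {
      std::cout << out_list.get(i).toStringRef() << std::endl;
    }
  }
  return 0;
}
```
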
From 4a971e6b6789310609ca84cf1c532084c1c6edc9 Mon Sep 17 00:00:00 2001 From: Misha Chornyi <99709299+mc-nv@users.noreply.github.com> Date: Wed, 23 Nov 2022 15:43:58 -0800 Subject: [PATCH 20/76] Update libraries path for PyTorch backend (#86) * Update 'libtorch' dependencies list * Update libraries and versions to sutisfy the dependency needs * Update library path * Remove ilp64 and iomp5 from build (#85) * Update library path * Remove ilp64 and iomp5 from build Co-authored-by: Misha Chornyi Co-authored-by: Iman Tabrizian --- CMakeLists.txt | 103 +++++++++++++++++++++++++++---------------------- 1 file changed, 56 insertions(+), 47 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 5c1a9bd..ff89da2 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -155,23 +155,22 @@ endif() # TRITON_PYTORCH_ENABLE_TORCHTRT if (CMAKE_HOST_SYSTEM_PROCESSOR MATCHES "aarch64") set(LIBS_ARCH "aarch64") - set(CONDA_LIBS + set(LIBTORCH_LIBS "libopenblas.so.0" ) else() set(LIBS_ARCH "x86_64") - set(CONDA_LIBS - "libmkl_core.so" - "libmkl_gnu_thread.so" - "libmkl_intel_lp64.so" - "libmkl_intel_thread.so" - "libmkl_def.so" - "libmkl_vml_def.so" - "libmkl_rt.so" - "libmkl_avx2.so" - "libmkl_avx512.so" - "libmkl_sequential.so" - "libomp.so" + set(LIBTORCH_LIBS + "libmkl_avx2.so.1" + "libmkl_avx512.so.1" + "libmkl_core.so.1" + "libmkl_def.so.1" + "libmkl_gnu_thread.so.1" + "libmkl_intel_lp64.so.1" + "libmkl_intel_thread.so.1" + "libmkl_rt.so.1" + "libmkl_sequential.so.1" + "libmkl_vml_def.so.1" ) endif() set(OPENCV_LIBS @@ -180,7 +179,10 @@ set(OPENCV_LIBS "libopencv_highgui.so" "libopencv_imgcodecs.so" "libopencv_imgproc.so" - "libopencv_core.so" + "libopencv_core.so" + "libopencv_calib3d.so" + "libopencv_flann.so" + "libopencv_features2d.so" "libpng16.so" "libjpeg.so" ) @@ -189,12 +191,12 @@ set(OPENCV_LIBS # Without these, the framework/backend complains of missing libraries / symbols and # in some cases leads to segmentation faults. if (${TRITON_PYTORCH_DOCKER_BUILD}) - string(REPLACE ";" " " CONDA_LIBS_STR "${CONDA_LIBS}") + string(REPLACE ";" " " LIBTORCH_LIBS_STR "${LIBTORCH_LIBS}") add_custom_command( OUTPUT ${PT_LIBS} - ${CONDA_LIBS} + ${LIBTORCH_LIBS} ${OPENCV_LIBS} LICENSE.pytorch include/torch @@ -203,43 +205,47 @@ if (${TRITON_PYTORCH_DOCKER_BUILD}) COMMAND docker pull ${TRITON_PYTORCH_DOCKER_IMAGE} COMMAND docker rm pytorch_backend_ptlib || echo "error ignored..." 
|| true COMMAND docker create --name pytorch_backend_ptlib ${TRITON_PYTORCH_DOCKER_IMAGE} - COMMAND /bin/sh -c "for i in ${CONDA_LIBS_STR} ; do echo copying $i && docker cp -L pytorch_backend_ptlib:/opt/conda/lib/$i $i ; done" - COMMAND docker cp pytorch_backend_ptlib:/opt/conda/lib/python3.8/site-packages/torch/lib/libc10.so libc10.so - COMMAND docker cp pytorch_backend_ptlib:/opt/conda/lib/python3.8/site-packages/torch/lib/libc10_cuda.so libc10_cuda.so - COMMAND docker cp pytorch_backend_ptlib:/opt/conda/lib/python3.8/site-packages/torch/lib/libtorch.so libtorch.so - COMMAND docker cp pytorch_backend_ptlib:/opt/conda/lib/python3.8/site-packages/torch/lib/libtorch_cpu.so libtorch_cpu.so - COMMAND docker cp pytorch_backend_ptlib:/opt/conda/lib/python3.8/site-packages/torch/lib/libtorch_cuda.so libtorch_cuda.so - COMMAND docker cp pytorch_backend_ptlib:/opt/conda/lib/python3.8/site-packages/torch/lib/libtorch_global_deps.so libtorch_global_deps.so - COMMAND docker cp pytorch_backend_ptlib:/opt/pytorch/vision/build/libtorchvision.so libtorchvision.so - COMMAND /bin/sh -c "if [ ${TRITON_PYTORCH_ENABLE_TORCHTRT} = 'ON' ]; then docker cp pytorch_backend_ptlib:/opt/conda/lib/python3.8/site-packages/torch_tensorrt/lib/libtorchtrt_runtime.so libtorchtrt_runtime.so; fi" - COMMAND docker cp pytorch_backend_ptlib:/opt/conda/lib/python3.8/site-packages/torch_tensorrt/bin/torchtrtc torchtrtc || echo "error ignored..." || true + COMMAND /bin/sh -c "for i in ${LIBTORCH_LIBS_STR} ; do echo copying $i && docker cp -L pytorch_backend_ptlib:/usr/local/lib/$i $i ; done" + COMMAND docker cp pytorch_backend_ptlib:/usr/local/lib/python3.8/dist-packages/torch/lib/libc10.so libc10.so + COMMAND docker cp pytorch_backend_ptlib:/usr/local/lib/python3.8/dist-packages/torch/lib/libc10_cuda.so libc10_cuda.so + COMMAND docker cp pytorch_backend_ptlib:/usr/local/lib/python3.8/dist-packages/torch/lib/libtorch.so libtorch.so + COMMAND docker cp pytorch_backend_ptlib:/usr/local/lib/python3.8/dist-packages/torch/lib/libtorch_cpu.so libtorch_cpu.so + COMMAND docker cp pytorch_backend_ptlib:/usr/local/lib/python3.8/dist-packages/torch/lib/libtorch_cuda.so libtorch_cuda.so + COMMAND docker cp pytorch_backend_ptlib:/usr/local/lib/python3.8/dist-packages/torch/lib/libtorch_global_deps.so libtorch_global_deps.so + COMMAND docker cp pytorch_backend_ptlib:/usr/local/lib/python3.8/dist-packages/torch/lib/libcaffe2_nvrtc.so libcaffe2_nvrtc.so + COMMAND docker cp pytorch_backend_ptlib:/usr/local/lib/libtorchvision.so libtorchvision.so + COMMAND /bin/sh -c "if [ ${TRITON_PYTORCH_ENABLE_TORCHTRT} = 'ON' ]; then docker cp pytorch_backend_ptlib:/usr/local/lib/python3.8/dist-packages/torch_tensorrt/lib/libtorchtrt_runtime.so libtorchtrt_runtime.so; fi" + COMMAND docker cp pytorch_backend_ptlib:/usr/local/lib/python3.8/dist-packages/torch_tensorrt/bin/torchtrtc torchtrtc || echo "error ignored..." || true COMMAND docker cp pytorch_backend_ptlib:/opt/pytorch/pytorch/LICENSE LICENSE.pytorch - COMMAND docker cp pytorch_backend_ptlib:/opt/conda/lib/python3.8/site-packages/torch/include include/torch + COMMAND docker cp pytorch_backend_ptlib:/usr/local/lib/python3.8/dist-packages/torch/include include/torch COMMAND docker cp pytorch_backend_ptlib:/opt/pytorch/pytorch/torch/csrc/jit/codegen include/torch/torch/csrc/jit/. 
COMMAND docker cp pytorch_backend_ptlib:/opt/pytorch/vision/torchvision/csrc include/torchvision/torchvision - COMMAND docker cp pytorch_backend_ptlib:/usr/lib/${LIBS_ARCH}-linux-gnu/libopencv_videoio.so.3.4.11 libopencv_videoio.so - COMMAND docker cp pytorch_backend_ptlib:/usr/lib/${LIBS_ARCH}-linux-gnu/libopencv_highgui.so.3.4.11 libopencv_highgui.so - COMMAND docker cp pytorch_backend_ptlib:/usr/lib/${LIBS_ARCH}-linux-gnu/libopencv_video.so.3.4.11 libopencv_video.so - COMMAND docker cp pytorch_backend_ptlib:/usr/lib/${LIBS_ARCH}-linux-gnu/libopencv_imgcodecs.so.3.4.11 libopencv_imgcodecs.so - COMMAND docker cp pytorch_backend_ptlib:/usr/lib/${LIBS_ARCH}-linux-gnu/libopencv_imgproc.so.3.4.11 libopencv_imgproc.so - COMMAND docker cp pytorch_backend_ptlib:/usr/lib/${LIBS_ARCH}-linux-gnu/libopencv_core.so.3.4.11 libopencv_core.so + COMMAND docker cp -L pytorch_backend_ptlib:/usr/local/lib/libopencv_videoio.so libopencv_videoio.so + COMMAND docker cp -L pytorch_backend_ptlib:/usr/local/lib/libopencv_highgui.so libopencv_highgui.so + COMMAND docker cp -L pytorch_backend_ptlib:/usr/local/lib/libopencv_video.so libopencv_video.so + COMMAND docker cp -L pytorch_backend_ptlib:/usr/local/lib/libopencv_imgcodecs.so libopencv_imgcodecs.so + COMMAND docker cp -L pytorch_backend_ptlib:/usr/local/lib/libopencv_imgproc.so libopencv_imgproc.so + COMMAND docker cp -L pytorch_backend_ptlib:/usr/local/lib/libopencv_core.so libopencv_core.so + COMMAND docker cp -L pytorch_backend_ptlib:/usr/local/lib/libopencv_calib3d.so libopencv_calib3d.so + COMMAND docker cp -L pytorch_backend_ptlib:/usr/local/lib/libopencv_features2d.so libopencv_features2d.so + COMMAND docker cp -L pytorch_backend_ptlib:/usr/local/lib/libopencv_flann.so libopencv_flann.so COMMAND docker cp pytorch_backend_ptlib:/usr/lib/${LIBS_ARCH}-linux-gnu/libpng16.so.16.37.0 libpng16.so COMMAND docker cp pytorch_backend_ptlib:/usr/lib/${LIBS_ARCH}-linux-gnu/libjpeg.so.8.2.2 libjpeg.so - COMMAND /bin/sh -c "if [ -f libmkl_def.so ]; then patchelf --add-needed libmkl_gnu_thread.so libmkl_def.so; fi" - COMMAND /bin/sh -c "if [ -f libmkl_def.so ]; then patchelf --add-needed libmkl_core.so libmkl_def.so; fi" - COMMAND /bin/sh -c "if [ -f libmkl_avx2.so ]; then patchelf --add-needed libmkl_gnu_thread.so libmkl_avx2.so; fi" - COMMAND /bin/sh -c "if [ -f libmkl_avx2.so ]; then patchelf --add-needed libmkl_core.so libmkl_avx2.so; fi" - COMMAND /bin/sh -c "if [ -f libmkl_avx512.so ]; then patchelf --add-needed libmkl_gnu_thread.so libmkl_avx512.so; fi" - COMMAND /bin/sh -c "if [ -f libmkl_avx512.so ]; then patchelf --add-needed libmkl_core.so libmkl_avx512.so; fi" - COMMAND /bin/sh -c "if [ -f libmkl_vml_def.so ]; then patchelf --add-needed libmkl_gnu_thread.so libmkl_vml_def.so; fi" - COMMAND /bin/sh -c "if [ -f libmkl_vml_def.so ]; then patchelf --add-needed libmkl_intel_thread.so libmkl_vml_def.so; fi" - COMMAND /bin/sh -c "if [ -f libmkl_vml_def.so ]; then patchelf --add-needed libmkl_core.so libmkl_vml_def.so; fi" - COMMAND /bin/sh -c "if [ -f libmkl_intel_thread.so ]; then patchelf --add-needed libmkl_intel_lp64.so libmkl_intel_thread.so; fi" + COMMAND /bin/sh -c "if [ -f libmkl_def.so.1 ]; then patchelf --add-needed libmkl_gnu_thread.so.1 libmkl_def.so.1; fi" + COMMAND /bin/sh -c "if [ -f libmkl_def.so.1 ]; then patchelf --add-needed libmkl_core.so.1 libmkl_def.so.1; fi" + COMMAND /bin/sh -c "if [ -f libmkl_avx2.so.1 ]; then patchelf --add-needed libmkl_gnu_thread.so.1 libmkl_avx2.so.1; fi" + COMMAND /bin/sh -c "if [ -f libmkl_avx2.so.1 ]; then patchelf 
--add-needed libmkl_core.so.1 libmkl_avx2.so.1; fi" + COMMAND /bin/sh -c "if [ -f libmkl_avx512.so.1 ]; then patchelf --add-needed libmkl_gnu_thread.so.1 libmkl_avx512.so.1; fi" + COMMAND /bin/sh -c "if [ -f libmkl_avx512.so.1 ]; then patchelf --add-needed libmkl_core.so.1 libmkl_avx512.so.1; fi" + COMMAND /bin/sh -c "if [ -f libmkl_vml_def.so.1 ]; then patchelf --add-needed libmkl_gnu_thread.so.1 libmkl_vml_def.so.1; fi" + COMMAND /bin/sh -c "if [ -f libmkl_vml_def.so.1 ]; then patchelf --add-needed libmkl_intel_thread.so.1 libmkl_vml_def.so.1; fi" + COMMAND /bin/sh -c "if [ -f libmkl_vml_def.so.1 ]; then patchelf --add-needed libmkl_core.so.1 libmkl_vml_def.so.1; fi" + COMMAND /bin/sh -c "if [ -f libmkl_intel_thread.so.1 ]; then patchelf --add-needed libmkl_intel_lp64.so.1 libmkl_intel_thread.so.1; fi" COMMAND docker rm pytorch_backend_ptlib COMMENT "Extracting pytorch and torchvision libraries and includes from ${TRITON_PYTORCH_DOCKER_IMAGE}" VERBATIM ) - add_custom_target(ptlib_target DEPENDS ${PT_LIBS} ${CONDA_LIBS} ${OPENCV_LIBS}) + add_custom_target(ptlib_target DEPENDS ${PT_LIBS} ${LIBTORCH_LIBS} ${OPENCV_LIBS}) add_library(ptlib SHARED IMPORTED GLOBAL) add_dependencies(ptlib ptlib_target) @@ -405,7 +411,7 @@ install( if (${TRITON_PYTORCH_DOCKER_BUILD}) set(PT_LIB_PATHS "") - FOREACH(plib ${PT_LIBS} ${CONDA_LIBS} ${OPENCV_LIBS}) + FOREACH(plib ${PT_LIBS} ${LIBTORCH_LIBS} ${OPENCV_LIBS}) set(PT_LIB_PATHS ${PT_LIB_PATHS} "${CMAKE_CURRENT_BINARY_DIR}/${plib}") ENDFOREACH(plib) @@ -424,7 +430,7 @@ if (${TRITON_PYTORCH_DOCKER_BUILD}) ) endif() # TRITON_PYTORCH_ENABLE_TORCHTRT - FOREACH(plib ${PT_LIBS} ${CONDA_LIBS} ${OPENCV_LIBS}) + FOREACH(plib ${PT_LIBS} ${LIBTORCH_LIBS} ${OPENCV_LIBS}) install( CODE "EXECUTE_PROCESS( @@ -437,7 +443,7 @@ if (${TRITON_PYTORCH_DOCKER_BUILD}) ) ENDFOREACH(plib) - set(OPENCV_VERSION "3.4") + set(OPENCV_VERSION "406") install( CODE "EXECUTE_PROCESS( @@ -447,6 +453,9 @@ if (${TRITON_PYTORCH_DOCKER_BUILD}) COMMAND ln -sf libopencv_imgcodecs.so libopencv_imgcodecs.so.${OPENCV_VERSION} COMMAND ln -sf libopencv_imgproc.so libopencv_imgproc.so.${OPENCV_VERSION} COMMAND ln -sf libopencv_core.so libopencv_core.so.${OPENCV_VERSION} + COMMAND ln -sf libopencv_calib3d.so libopencv_calib3d.so.${OPENCV_VERSION} + COMMAND ln -sf libopencv_features2d.so libopencv_features2d.so.${OPENCV_VERSION} + COMMAND ln -sf libopencv_flann.so libopencv_flann.so.${OPENCV_VERSION} COMMAND ln -sf libpng16.so libpng16.so.16 COMMAND ln -sf libjpeg.so libjpeg.so.8 RESULT_VARIABLE LINK_STATUS From 9e9a9a6c1fcdc9e7b54d728f686ebf9cbe336592 Mon Sep 17 00:00:00 2001 From: Ryan McCormick Date: Tue, 6 Dec 2022 16:20:52 -0800 Subject: [PATCH 21/76] re-enable nvfuser (#87) * re-enable nvfuser * WARN->INFO, formatting --- src/libtorch.cc | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/src/libtorch.cc b/src/libtorch.cc index 4e327a3..66d2908 100644 --- a/src/libtorch.cc +++ b/src/libtorch.cc @@ -432,7 +432,6 @@ ModelState::ParseParameters() .c_str()); } - // TODO Re-enable NvFuser once fixed // If 'ENABLE_NVFUSER' is not present in 'parameters' then no // update is made to 'enable_nvfuser'. bool enable_nvfuser = false; @@ -448,11 +447,9 @@ ModelState::ParseParameters() TRITONSERVER_ErrorDelete(err); } } else { - // Override, disable NvFuser till fixed - enable_nvfuser = false; enable_nvfuser_pair_ = {true, enable_nvfuser}; LOG_MESSAGE( - TRITONSERVER_LOG_WARN, (std::string("NvFuser is ") + + TRITONSERVER_LOG_INFO, (std::string("NvFuser is ") + (enable_nvfuser ? 
"enabled" : "disabled") + " for model instance '" + Name() + "'") .c_str()); @@ -1231,7 +1228,7 @@ ModelInstanceState::ProcessRequests( &compute_input_duration, compute_input_start_event_, compute_infer_start_event_), TRITONSERVER_ERROR_INTERNAL, "Failed to capture elapsed time"), - "Failed to capture elapsed time"); + "Failed to capture elapsed time"); LOG_IF_ERROR( ConvertCUDAStatusToTritonError( @@ -1239,7 +1236,7 @@ ModelInstanceState::ProcessRequests( &compute_infer_duration, compute_infer_start_event_, compute_output_start_event_), TRITONSERVER_ERROR_INTERNAL, "Failed to capture elapsed time"), - "Failed to capture elapsed time"); + "Failed to capture elapsed time"); compute_start_ns = exec_start_ns + (compute_input_duration * 1e6); compute_end_ns = compute_start_ns + (compute_infer_duration * 1e6); From f81ec73e802db216eda108763364ec4796c16ebb Mon Sep 17 00:00:00 2001 From: Kris Hung Date: Sat, 31 Dec 2022 05:56:30 +0800 Subject: [PATCH 22/76] Remove unused input_memories variable (#89) --- src/libtorch.cc | 18 +++--------------- 1 file changed, 3 insertions(+), 15 deletions(-) diff --git a/src/libtorch.cc b/src/libtorch.cc index 66d2908..90972f0 100644 --- a/src/libtorch.cc +++ b/src/libtorch.cc @@ -513,8 +513,7 @@ class ModelInstanceState : public BackendModelInstance { const uint32_t request_count, std::vector* responses, BackendInputCollector* collector, std::vector* input_names, - std::vector* input_tensors, - std::vector* input_memories, bool* cuda_copy); + std::vector* input_tensors, bool* cuda_copy); TRITONSERVER_Error* ReadOutputTensors( size_t total_batch_size, const std::vector& output_tensors, @@ -1102,7 +1101,6 @@ ModelInstanceState::ProcessRequests( std::vector input_names; std::vector input_tensors; - std::vector input_memories; bool cuda_copy = false; std::unique_ptr collector; if (Kind() == TRITONSERVER_INSTANCEGROUPKIND_GPU) { @@ -1124,8 +1122,7 @@ ModelInstanceState::ProcessRequests( responses, request_count, all_response_failed, SetInputTensors( total_batch_size, requests, request_count, &responses, - collector.get(), &input_names, &input_tensors, &input_memories, - &cuda_copy)); + collector.get(), &input_names, &input_tensors, &cuda_copy)); } #ifdef TRITON_ENABLE_GPU @@ -1149,14 +1146,6 @@ ModelInstanceState::ProcessRequests( Execute(&responses, request_count, &input_tensors, &output_tensors); } - // Free BackendMemory used for inputs - for (BackendMemory* mem : input_memories) { - if (mem != nullptr) { - delete mem; - } - } - input_memories.clear(); - // Verify output indices are valid with number of outputs after execution bool invalid_index = false; int max_index = output_tensors.size() - 1; @@ -1718,8 +1707,7 @@ ModelInstanceState::SetInputTensors( const uint32_t request_count, std::vector* responses, BackendInputCollector* collector, std::vector* input_names, - std::vector* input_tensors, - std::vector* input_memories, bool* cuda_copy) + std::vector* input_tensors, bool* cuda_copy) { // InferenceMode should be used to guard all tensors operations torch::InferenceMode infer_guard(model_state_->EnabledInferenceMode()); From 2559db96d5fb9617d9e10b2926158d59cb61b29b Mon Sep 17 00:00:00 2001 From: dyastremsky <58150256+dyastremsky@users.noreply.github.com> Date: Wed, 8 Feb 2023 11:25:30 -0800 Subject: [PATCH 23/76] Add linear algebra library (#92) --- CMakeLists.txt | 2 ++ 1 file changed, 2 insertions(+) diff --git a/CMakeLists.txt b/CMakeLists.txt index ff89da2..d757874 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -136,6 +136,7 @@ set(PT_LIBS 
"libtorch.so" "libtorch_cpu.so" "libtorch_cuda.so" + "libtorch_cuda_linalg.so" "libtorch_global_deps.so" ) @@ -211,6 +212,7 @@ if (${TRITON_PYTORCH_DOCKER_BUILD}) COMMAND docker cp pytorch_backend_ptlib:/usr/local/lib/python3.8/dist-packages/torch/lib/libtorch.so libtorch.so COMMAND docker cp pytorch_backend_ptlib:/usr/local/lib/python3.8/dist-packages/torch/lib/libtorch_cpu.so libtorch_cpu.so COMMAND docker cp pytorch_backend_ptlib:/usr/local/lib/python3.8/dist-packages/torch/lib/libtorch_cuda.so libtorch_cuda.so + COMMAND docker cp pytorch_backend_ptlib:/usr/local/lib/python3.8/dist-packages/torch/lib/libtorch_cuda_linalg.so libtorch_cuda_linalg.so COMMAND docker cp pytorch_backend_ptlib:/usr/local/lib/python3.8/dist-packages/torch/lib/libtorch_global_deps.so libtorch_global_deps.so COMMAND docker cp pytorch_backend_ptlib:/usr/local/lib/python3.8/dist-packages/torch/lib/libcaffe2_nvrtc.so libcaffe2_nvrtc.so COMMAND docker cp pytorch_backend_ptlib:/usr/local/lib/libtorchvision.so libtorchvision.so From c077c862c045f24fec6274163a9913b609f78bcb Mon Sep 17 00:00:00 2001 From: kthui <18255193+kthui@users.noreply.github.com> Date: Wed, 8 Feb 2023 15:18:04 -0800 Subject: [PATCH 24/76] Add check for sequence data type (#93) --- src/libtorch.cc | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/src/libtorch.cc b/src/libtorch.cc index 90972f0..6f23faa 100644 --- a/src/libtorch.cc +++ b/src/libtorch.cc @@ -1,4 +1,4 @@ -// Copyright 2019-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// Copyright 2019-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions @@ -741,6 +741,15 @@ ModelInstanceState::ValidateTypedSequenceControl( } } + // check if the data type is supported by PyTorch + if (!ModelConfigDataTypeToTorchType(tensor_datatype).first) { + return TRITONSERVER_ErrorNew( + TRITONSERVER_ERROR_INTERNAL, + ("input '" + tensor_name + "' type '" + tensor_datatype + + "' is not supported by PyTorch.") + .c_str()); + } + ip_index = std::atoi(tensor_name.substr(start_pos + 2).c_str()); input_index_map_[tensor_name] = ip_index; } From 4a8a870f0c759ec6b0b23594881ba0ae384b60f3 Mon Sep 17 00:00:00 2001 From: R0CKSTAR Date: Fri, 10 Feb 2023 09:21:32 +0800 Subject: [PATCH 25/76] Update pytorch docker image tag to 22.12 in README.md (#91) * Update pytorch docker image tag to 22.12 in README.md * Update the copyright year in README.md --- README.md | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index e39152a..c832ae5 100644 --- a/README.md +++ b/README.md @@ -1,5 +1,5 @@ +# PyTorch (LibTorch) Backend + [![License](https://img.shields.io/badge/License-BSD3-lightgrey.svg)](https://opensource.org/licenses/BSD-3-Clause) -# PyTorch (LibTorch) Backend +The Triton backend for +[PyTorch](https://github.com/pytorch/pytorch) +is designed to run +[TorchScript](https://pytorch.org/docs/stable/jit.html) +models using the PyTorch C++ API. +All models created in PyTorch using the python API must be traced/scripted to produce a TorchScript model. + +You can learn more about Triton backends in the +[Triton Backend](https://github.com/triton-inference-server/backend) +repository. + +Ask questions or report problems using +[Triton Server issues](https://github.com/triton-inference-server/server/issues). -The Triton backend for [PyTorch](https://github.com/pytorch/pytorch). 
-You can learn more about Triton backends in the [backend -repo](https://github.com/triton-inference-server/backend). Ask -questions or report problems on the [issues -page](https://github.com/triton-inference-server/server/issues). -This backend is designed to run [TorchScript](https://pytorch.org/docs/stable/jit.html) -models using the PyTorch C++ API. All models created in PyTorch -using the python API must be traced/scripted to produce a TorchScript -model. - -Where can I ask general questions about Triton and Triton backends? -Be sure to read all the information below as well as the [general -Triton documentation](https://github.com/triton-inference-server/server#triton-inference-server) -available in the main [server](https://github.com/triton-inference-server/server) -repo. If you don't find your answer there you can ask questions on the -main Triton [issues page](https://github.com/triton-inference-server/server/issues). +Be sure to read all the information below as well as the +[general Triton documentation](https://github.com/triton-inference-server/server#triton-inference-server) +available in the [Triton Server](https://github.com/triton-inference-server/server) repository. ## Build the PyTorch Backend -Use a recent cmake to build. First install the required dependencies. +Use a recent cmake to build. +First install the required dependencies. -``` -$ apt-get install rapidjson-dev python3-dev python3-pip -$ pip3 install patchelf==0.17.2 +```bash +apt-get install rapidjson-dev python3-dev python3-pip +pip3 install patchelf==0.17.2 ``` -An appropriate PyTorch container from [NGC](https://ngc.nvidia.com) must be used. -For example, to build a backend that uses the 23.04 version of the PyTorch -container from NGC: +An appropriate PyTorch container from [NVIDIA NGC Catalog](https://ngc.nvidia.com) must be used. +For example, to build a backend that uses the 23.04 version of the PyTorch container from NGC: -``` -$ mkdir build -$ cd build -$ cmake -DCMAKE_INSTALL_PREFIX:PATH=`pwd`/install -DTRITON_PYTORCH_DOCKER_IMAGE="nvcr.io/nvidia/pytorch:23.04-py3" .. -$ make install +```bash +mkdir build +cd build +cmake -DCMAKE_INSTALL_PREFIX:PATH=`pwd`/install -DTRITON_PYTORCH_DOCKER_IMAGE="nvcr.io/nvidia/pytorch:23.04-py3" .. +make install ``` -The following required Triton repositories will be pulled and used in -the build. By default, the "main" branch/tag will be used for each repo -but the listed CMake argument can be used to override. +The following required Triton repositories will be pulled and used in the build. +By default, the `main` head will be used for each repository but the listed CMake argument can be used to override the value. -* triton-inference-server/backend: -DTRITON_BACKEND_REPO_TAG=[tag] -* triton-inference-server/core: -DTRITON_CORE_REPO_TAG=[tag] -* triton-inference-server/common: -DTRITON_COMMON_REPO_TAG=[tag] +* triton-inference-server/backend: `-DTRITON_BACKEND_REPO_TAG=[tag]` +* triton-inference-server/core: `-DTRITON_CORE_REPO_TAG=[tag]` +* triton-inference-server/common: `-DTRITON_COMMON_REPO_TAG=[tag]` ## Build the PyTorch Backend With Custom PyTorch -Currently, Triton requires that a specially patched version of -PyTorch be used with the PyTorch backend. The full source for -these PyTorch versions are available as Docker images from -[NGC](https://ngc.nvidia.com). For example, the PyTorch version -compatible with the 25.09 release of Triton is available as -nvcr.io/nvidia/pytorch:25.09-py3. 
+Currently, Triton requires that a specially patched version of PyTorch be used with the PyTorch backend. +The full source for these PyTorch versions are available as Docker images from +[NGC](https://ngc.nvidia.com). -Copy over the LibTorch and Torchvision headers and libraries from the +For example, the PyTorch version compatible with the 25.09 release of Triton is available as `nvcr.io/nvidia/pytorch:25.09-py3` which supports PyTorch version `2.9.0a0`. + +> [!NOTE] +> Additional details and version information can be found in the container's +> [release notes](https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/rel-25-09.html#rel-25-09). + +Copy over the LibTorch and TorchVision headers and libraries from the [PyTorch NGC container](https://ngc.nvidia.com/catalog/containers/nvidia:pytorch) -into local directories. You can see which headers and libraries -are needed/copied from the docker. +into local directories. +You can see which headers and libraries are needed/copied from the docker. -``` -$ mkdir build -$ cd build -$ cmake -DCMAKE_INSTALL_PREFIX:PATH=`pwd`/install -DTRITON_PYTORCH_INCLUDE_PATHS="/torch;/torch/torch/csrc/api/include;/torchvision" -DTRITON_PYTORCH_LIB_PATHS="" .. -$ make install +```bash +mkdir build +cd build +cmake -DCMAKE_INSTALL_PREFIX:PATH=`pwd`/install -DTRITON_PYTORCH_INCLUDE_PATHS="/torch;/torch/torch/csrc/api/include;/torchvision" -DTRITON_PYTORCH_LIB_PATHS="" .. +make install ``` ## Using the PyTorch Backend -### Parameters +### PyTorch 2.0 Models -Triton exposes some flags to control the execution mode of the TorchScript models through -the Parameters section of the model's `config.pbtxt` file. +The model repository should look like: -* `DISABLE_OPTIMIZED_EXECUTION`: Boolean flag to disable the optimized execution -of TorchScript models. By default, the optimized execution is always enabled. +```bash +model_repository/ +`-- model_directory + |-- 1 + | |-- model.py + | `-- [model.pt] + `-- config.pbtxt +``` -The initial calls to a loaded TorchScript model take extremely long. Due to this longer -model warmup [issue](https://github.com/pytorch/pytorch/issues/57894), Triton also allows -execution of models without these optimizations. In some models, optimized execution -does not benefit performance as seen [here](https://github.com/pytorch/pytorch/issues/19978) -and in other cases impacts performance negatively, as seen [here](https://github.com/pytorch/pytorch/issues/53824). +The `model.py` contains the class definition of the PyTorch model. +The class should extend the +[`torch.nn.Module`](https://pytorch.org/docs/stable/generated/torch.nn.Module.html#torch.nn.Module). +The `model.pt` may be optionally provided which contains the saved +[`state_dict`](https://pytorch.org/tutorials/beginner/saving_loading_models.html#saving-loading-model-for-inference) +of the model. -The section of model config file specifying this parameter will look like: +### TorchScript Models -``` -parameters: { -key: "DISABLE_OPTIMIZED_EXECUTION" - value: { - string_value: "true" - } -} +The model repository should look like: + +```bash +model_repository/ +`-- model_directory + |-- 1 + | `-- model.pt + `-- config.pbtxt ``` -* `INFERENCE_MODE`: Boolean flag to enable the Inference Mode execution -of TorchScript models. By default, the inference mode is enabled. +The `model.pt` is the TorchScript model file. 
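For reference, a `model.pt` matching the TorchScript layout above can be produced with `torch.jit.script` (or `torch.jit.trace`). The sketch below is illustrative only; the `AddSub` module, its two outputs, and the save location are assumptions, not something the backend prescribes.

```python
# Minimal sketch: export a toy module as a TorchScript model.pt.
# The module, shapes, and filename below are examples, not backend requirements.
import torch


class AddSub(torch.nn.Module):
    def forward(self, x: torch.Tensor, y: torch.Tensor):
        # Two outputs, e.g. matching OUTPUT__0 / OUTPUT__1 in config.pbtxt.
        return x + y, x - y


scripted = torch.jit.script(AddSub().eval())
scripted.save("model.pt")
```

The saved file is then placed as `model_directory/1/model.pt` next to the model's `config.pbtxt`, as in the layout shown above.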
-[InferenceMode](https://pytorch.org/cppdocs/notes/inference_mode.html) is a new -RAII guard analogous to NoGradMode to be used when you are certain your operations -will have no interactions with autograd. Compared to NoGradMode, code run under -this mode gets better performance by disabling autograd. +## Configuration -Please note that in some models, InferenceMode might not benefit performance -and in fewer cases might impact performance negatively. +Triton exposes some flags to control the execution mode of the TorchScript models through the `Parameters` section of the model's `config.pbtxt` file. -The section of model config file specifying this parameter will look like: +### Parameters -``` -parameters: { -key: "INFERENCE_MODE" - value: { - string_value: "true" - } -} -``` +* `DISABLE_OPTIMIZED_EXECUTION`: + Boolean flag to disable the optimized execution of TorchScript models. + By default, the optimized execution is always enabled. -* `DISABLE_CUDNN`: Boolean flag to disable the cuDNN library. By default, cuDNN is enabled. + The initial calls to a loaded TorchScript model take a significant amount of time. + Due to this longer model warmup + ([pytorch #57894](https://github.com/pytorch/pytorch/issues/57894)), + Triton also allows execution of models without these optimizations. + In some models, optimized execution does not benefit performance + ([pytorch #19978](https://github.com/pytorch/pytorch/issues/19978)) + and in other cases impacts performance negatively + ([pytorch #53824](https://github.com/pytorch/pytorch/issues/53824)). -[cuDNN](https://developer.nvidia.com/cudnn) is a GPU-accelerated library of primitives for -deep neural networks. cuDNN provides highly tuned implementations for standard routines. + The section of model config file specifying this parameter will look like: -Typically, models run with cuDNN enabled are faster. However there are some exceptions -where using cuDNN can be slower, cause higher memory usage or result in errors. + ```proto + parameters: { + key: "DISABLE_OPTIMIZED_EXECUTION" + value: { string_value: "true" } + } + ``` +* `INFERENCE_MODE`: -The section of model config file specifying this parameter will look like: + Boolean flag to enable the Inference Mode execution of TorchScript models. + By default, the inference mode is enabled. -``` -parameters: { -key: "DISABLE_CUDNN" - value: { - string_value: "true" - } -} -``` + [InferenceMode](https://pytorch.org/cppdocs/notes/inference_mode.html) is a new RAII guard analogous to `NoGradMode` to be used when you are certain your operations will have no interactions with autograd. + Compared to `NoGradMode`, code run under this mode gets better performance by disabling autograd. -* `ENABLE_WEIGHT_SHARING`: Boolean flag to enable model instances on the same device to -share weights. This optimization should not be used with stateful models. If not specified, -weight sharing is disabled. + Please note that in some models, InferenceMode might not benefit performance and in fewer cases might impact performance negatively. -The section of model config file specifying this parameter will look like: + To enable inference mode, use the configuration example below: -``` -parameters: { -key: "ENABLE_WEIGHT_SHARING" - value: { - string_value: "true" - } -} -``` + ```proto + parameters: { + key: "INFERENCE_MODE" + value: { string_value: "true" } + } + ``` -* `ENABLE_CACHE_CLEANING`: Boolean flag to enable CUDA cache cleaning after each model execution. -If not specified, cache cleaning is disabled. 
This flag has no effect if model is on CPU. -Setting this flag to true will negatively impact the performance due to additional CUDA cache -cleaning operation after each model execution. Therefore, you should only use this flag if you -serve multiple models with Triton and encounter CUDA out of memory issue during model executions. +* `DISABLE_CUDNN`: -The section of model config file specifying this parameter will look like: + Boolean flag to disable the cuDNN library. + By default, cuDNN is enabled. -``` -parameters: { -key: "ENABLE_CACHE_CLEANING" - value: { - string_value:"true" - } -} -``` + [cuDNN](https://developer.nvidia.com/cudnn) is a GPU-accelerated library of primitives for deep neural networks. + It provides highly tuned implementations for standard routines. + + Typically, models run with cuDNN enabled execute faster. + However there are some exceptions where using cuDNN can be slower, cause higher memory usage, or result in errors. + + To disable cuDNN, use the configuration example below: + + ```proto + parameters: { + key: "DISABLE_CUDNN" + value: { string_value: "true" } + } + ``` + +* `ENABLE_WEIGHT_SHARING`: + + Boolean flag to enable model instances on the same device to share weights. + This optimization should not be used with stateful models. + If not specified, weight sharing is disabled. + + To enable weight sharing, use the configuration example below: + + ```proto + parameters: { + key: "ENABLE_WEIGHT_SHARING" + value: { string_value: "true" } + } + ``` + +* `ENABLE_CACHE_CLEANING`: + + Boolean flag to enable CUDA cache cleaning after each model execution. + If not specified, cache cleaning is disabled. + This flag has no effect if model is on CPU. + + Setting this flag to true will likely negatively impact the performance due to additional CUDA cache cleaning operation after each model execution. + Therefore, you should only use this flag if you serve multiple models with Triton and encounter CUDA out-of-memory issues during model executions. + + To enable cleaning of the CUDA cache after every execution, use the configuration example below: + + ```proto + parameters: { + key: "ENABLE_CACHE_CLEANING" + value: { string_value: "true" } + } + ``` * `INTER_OP_THREAD_COUNT`: -PyTorch allows using multiple CPU threads during TorchScript model inference. -One or more inference threads execute a model's forward pass on the given -inputs. Each inference thread invokes a JIT interpreter that executes the ops -of a model inline, one by one. This parameter sets the size of this thread -pool. The default value of this setting is the number of cpu cores. Please refer -to [this](https://pytorch.org/docs/stable/notes/cpu_threading_torchscript_inference.html) -document on how to set this parameter properly. + PyTorch allows using multiple CPU threads during TorchScript model inference. + One or more inference threads execute a model’s forward pass on the given inputs. + Each inference thread invokes a JIT interpreter that executes the ops of a model inline, one by one. -The section of model config file specifying this parameter will look like: + This parameter sets the size of this thread pool. + The default value of this setting is the number of cpu cores. -``` -parameters: { -key: "INTER_OP_THREAD_COUNT" - value: { - string_value:"1" - } -} -``` + > [!TIP] + > Refer to + > [CPU Threading TorchScript](https://pytorch.org/docs/stable/notes/cpu_threading_torchscript_inference.html) + > on how to set this parameter properly. 
+ + To set the inter-op thread count, use the configuration example below: + + ```proto + parameters: { + key: "INTER_OP_THREAD_COUNT" + value: { string_value: "1" } + } + ``` > [!NOTE] > This parameter is set globally for the PyTorch backend. @@ -225,70 +265,68 @@ key: "INTER_OP_THREAD_COUNT" * `INTRA_OP_THREAD_COUNT`: -In addition to the inter-op parallelism, PyTorch can also utilize multiple threads -within the ops (intra-op parallelism). This can be useful in many cases, including -element-wise ops on large tensors, convolutions, GEMMs, embedding lookups and -others. The default value for this setting is the number of CPU cores. Please refer -to [this](https://pytorch.org/docs/stable/notes/cpu_threading_torchscript_inference.html) -document on how to set this parameter properly. + In addition to the inter-op parallelism, PyTorch can also utilize multiple threads within the ops (intra-op parallelism). + This can be useful in many cases, including element-wise ops on large tensors, convolutions, GEMMs, embedding lookups and others. -The section of model config file specifying this parameter will look like: + The default value for this setting is the number of CPU cores. -``` -parameters: { -key: "INTRA_OP_THREAD_COUNT" - value: { - string_value:"1" - } -} -``` + > [!TIP] + > Refer to + > [CPU Threading TorchScript](https://pytorch.org/docs/stable/notes/cpu_threading_torchscript_inference.html) + > on how to set this parameter properly. -> [!NOTE] -> This parameter is set globally for the PyTorch backend. -> The value from the first model config file that specifies this parameter will be used. -> Subsequent values from other model config files, if different, will be ignored. + To set the intra-op thread count, use the configuration example below: + + ```proto + parameters: { + key: "INTRA_OP_THREAD_COUNT" + value: { string_value: "1" } + } + ``` -* Additional Optimizations: Three additional boolean parameters are available to disable -certain Torch optimizations that can sometimes cause latency regressions in models with -complex execution modes and dynamic shapes. If not specified, all are enabled by default. +* **Additional Optimizations**: + + Three additional boolean parameters are available to disable certain Torch optimizations that can sometimes cause latency regressions in models with complex execution modes and dynamic shapes. + If not specified, all are enabled by default. `ENABLE_JIT_EXECUTOR` `ENABLE_JIT_PROFILING` -### PyTorch 2.0 Models +### Model Instance Group Kind -The model repository should look like: +The PyTorch backend supports the following kinds of +[Model Instance Groups](https://github.com/triton-inference-server/server/blob/main/docs/user_guide/model_configuration.md#instance-groups) +where the input tensors are placed as follows: -```bash -model_repository/ -`-- model_directory - |-- 1 - | |-- model.py - | `-- [model.pt] - `-- config.pbtxt -``` +* `KIND_GPU`: -The `model.py` contains the class definition of the PyTorch model. -The class should extend the -[`torch.nn.Module`](https://pytorch.org/docs/stable/generated/torch.nn.Module.html#torch.nn.Module). -The `model.pt` may be optionally provided which contains the saved -[`state_dict`](https://pytorch.org/tutorials/beginner/saving_loading_models.html#saving-loading-model-for-inference) -of the model. + Inputs are prepared on the GPU device associated with the model instance. -### TorchScript Models +* `KIND_CPU`: -The model repository should look like: + Inputs are prepared on the CPU. 
-```bash -model_repository/ -`-- model_directory - |-- 1 - | `-- model.pt - `-- config.pbtxt -``` +* `KIND_MODEL`: -The `model.pt` is the TorchScript model file. + Inputs are prepared on the CPU. + When loading the model, the backend does not choose the GPU device for the model; + instead, it respects the device(s) specified in the model and uses them as they are during inference. + + This is useful when the model internally utilizes multiple GPUs, as demonstrated in + [this example model](https://github.com/triton-inference-server/server/blob/main/qa/L0_libtorch_instance_group_kind_model/gen_models.py). + + > [!IMPORTANT] + > If a device is not specified in the model, the backend uses the first available GPU device. + +To set the model instance group, use the configuration example below: + +```proto +instance_group { + count: 2 + kind: KIND_GPU +} +``` ### Customization @@ -329,69 +367,46 @@ parameters: { } ``` -### Support +## Important Notes -#### Model Instance Group Kind +* The execution of PyTorch model on GPU is asynchronous in nature. + See + [CUDA Asynchronous Execution](https://pytorch.org/docs/stable/notes/cuda.html#asynchronous-execution) + for additional details. + Consequently, an error in PyTorch model execution may be raised during the next few inference requests to the server. + Setting environment variable `CUDA_LAUNCH_BLOCKING=1` when launching server will help in correctly debugging failing cases by forcing synchronous execution. -The PyTorch backend supports the following kinds of -[Model Instance Groups](https://github.com/triton-inference-server/server/blob/main/docs/user_guide/model_configuration.md#instance-groups) -where the input tensors are placed as follows: + * The PyTorch model in such cases may or may not recover from the failed state and a restart of the server may be required to continue serving successfully. -* `KIND_GPU`: Inputs are prepared on the GPU device associated with the model -instance. - -* `KIND_CPU`: Inputs are prepared on the CPU. - -* `KIND_MODEL`: Inputs are prepared on the CPU. When loading the model, the -backend does not choose the GPU device for the model; instead, it respects the -device(s) specified in the model and uses them as they are during inference. -This is useful when the model internally utilizes multiple GPUs, as demonstrated -in this -[example model](https://github.com/triton-inference-server/server/blob/main/qa/L0_libtorch_instance_group_kind_model/gen_models.py). -If no device is specified in the model, the backend uses the first available -GPU device. This feature is available starting in the 23.06 release. - -### Important Notes - -* The execution of PyTorch model on GPU is asynchronous in nature. See - [here](https://pytorch.org/docs/stable/notes/cuda.html#asynchronous-execution) - for more details. Consequently, an error in PyTorch model execution may - be raised during the next few inference requests to the server. Setting - environment variable `CUDA_LAUNCH_BLOCKING=1` when launching server will - help in correctly debugging failing cases by forcing synchronous execution. - * The PyTorch model in such cases may or may not recover from the failed - state and a restart of the server may be required to continue serving - successfully. - -* PyTorch does not support Tensor of Strings but it does support models that -accept a List of Strings as input(s) / produces a List of String as output(s). -For these models Triton allows users to pass String input(s)/receive String -output(s) using the String datatype. 
As a limitation of using List instead of
-Tensor for String I/O, only for 1-dimensional input(s)/output(s) are supported
-for I/O of String type.
+* PyTorch does not support Tensor of Strings but it does support models that accept a List of Strings as input(s) / produce a List of Strings as output(s).
+  For these models, Triton allows users to pass String input(s)/receive String output(s) using the String datatype.
+  As a limitation of using List instead of Tensor for String I/O, only 1-dimensional input(s)/output(s) are supported for String I/O.

 * In a multi-GPU environment, a potential runtime issue can occur when using
-[Tracing](https://pytorch.org/docs/stable/generated/torch.jit.trace.html)
-to generate a
-[TorchScript](https://pytorch.org/docs/stable/jit.html) model. This issue
-arises due to a device mismatch between the model instance and the tensor. By
-default, Triton creates a single execution instance of the model for each
-available GPU. The runtime error occurs when a request is sent to a model
-instance with a different GPU device from the one used during the TorchScript
-generation process. To address this problem, it is highly recommended to use
-[Scripting](https://pytorch.org/docs/stable/generated/torch.jit.script.html#torch.jit.script)
-instead of Tracing for model generation in a multi-GPU environment. Scripting
-avoids the device mismatch issue and ensures compatibility with different GPUs
-when used with Triton. However, if using Tracing is unavoidable, there is a
-workaround available. You can explicitly specify the GPU device for the model
-instance in the
-[model configuration](https://github.com/triton-inference-server/server/blob/main/docs/user_guide/model_configuration.md#instance-groups)
-to ensure that the model instance and the tensors used for inference are
-assigned to the same GPU device as on which the model was traced.
-* Python functions optimizable by `torch.compile` may not be served directly in the `model.py` file, they need to be enclosed by a class extending the
-  [`torch.nn.Module`](https://pytorch.org/docs/stable/generated/torch.nn.Module.html#torch.nn.Module).
+  [Tracing](https://pytorch.org/docs/stable/generated/torch.jit.trace.html)
+  to generate a
+  [TorchScript](https://pytorch.org/docs/stable/jit.html)
+  model.
+  This issue arises due to a device mismatch between the model instance and the tensor.
-* Model weights cannot be shared across multiple instances on the same GPU device.
+  By default, Triton creates a single execution instance of the model for each available GPU.
+  The runtime error occurs when a request is sent to a model instance with a different GPU device from the one used during the TorchScript generation process.
+
+  To address this problem, it is highly recommended to use
+  [Scripting](https://pytorch.org/docs/stable/generated/torch.jit.script.html#torch.jit.script)
+  instead of Tracing for model generation in a multi-GPU environment.
+  Scripting avoids the device mismatch issue and ensures compatibility with different GPUs when used with Triton.
+
+  However, if using Tracing is unavoidable, there is a workaround available.
+  You can explicitly specify the GPU device for the model instance in the
+  [model configuration](https://github.com/triton-inference-server/server/blob/main/docs/user_guide/model_configuration.md#instance-groups)
+  to ensure that the model instance and the tensors used for inference are assigned to the same GPU device as on which the model was traced.
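To make the Scripting recommendation in the note above concrete, here is a minimal sketch contrasting the two export paths; `MyModel`, the example input, and the output filename are hypothetical, not part of the backend.

```python
# Minimal sketch: Scripting vs. Tracing for a model that will run on whichever
# GPU Triton assigns to the instance. Names and shapes here are examples only.
import torch


class MyModel(torch.nn.Module):
    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return torch.relu(x) * 2


device = "cuda:0" if torch.cuda.is_available() else "cpu"
model = MyModel().eval().to(device)

# Tracing specializes the exported graph to one concrete run on `device`,
# which is where the device mismatch described in the note above can originate.
traced = torch.jit.trace(model, torch.randn(1, 4, device=device))

# Scripting compiles the model's code instead, so the exported model does not
# depend on the GPU used at export time; this is the recommended path.
scripted = torch.jit.script(model)
scripted.save("model.pt")
```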
* When using `KIND_MODEL` as model instance kind, the default device of the first parameter on the model is used. + +> [!WARNING] +> +> * Python functions optimizable by `torch.compile` may not be served directly in the `model.py` file, they need to be enclosed by a class extending the + [`torch.nn.Module`](https://pytorch.org/docs/stable/generated/torch.nn.Module.html#torch.nn.Module). +> +> * Model weights cannot be shared across multiple instances on the same GPU device. From 92692f8f09cd611bef06c01f1d4ea7933e22a323 Mon Sep 17 00:00:00 2001 From: J Wyman Date: Wed, 29 Oct 2025 14:48:39 -0400 Subject: [PATCH 72/76] docs: Correct README Instructions (#164) This change corrects the instruction for how to use PyTorch 2 with the backend. --- README.md | 27 +++++++++++++++++++-------- 1 file changed, 19 insertions(+), 8 deletions(-) diff --git a/README.md b/README.md index dc227e1..ccc803c 100644 --- a/README.md +++ b/README.md @@ -103,23 +103,23 @@ make install ### PyTorch 2.0 Models +PyTorch 2.0 features are available. +However, Triton's PyTorch backend requires a serialized representation of the model in the form a `model.pt` file. +The serialized representation of the model can be generated using PyTorch's +[`torch.save()`](https://docs.pytorch.org/tutorials/beginner/saving_loading_models.html#id1) +function to generate the `model.pt` file. + The model repository should look like: ```bash model_repository/ `-- model_directory |-- 1 - | |-- model.py - | `-- [model.pt] + | `-- model.pt `-- config.pbtxt ``` -The `model.py` contains the class definition of the PyTorch model. -The class should extend the -[`torch.nn.Module`](https://pytorch.org/docs/stable/generated/torch.nn.Module.html#torch.nn.Module). -The `model.pt` may be optionally provided which contains the saved -[`state_dict`](https://pytorch.org/tutorials/beginner/saving_loading_models.html#saving-loading-model-for-inference) -of the model. +Where `model.pt` is the serialized representation of the model. ### TorchScript Models @@ -139,6 +139,17 @@ The `model.pt` is the TorchScript model file. Triton exposes some flags to control the execution mode of the TorchScript models through the `Parameters` section of the model's `config.pbtxt` file. +### Configuration Options + +* `default_model_name`: + Instructs the Triton PyTorch backend to load the model from a file of the given name. + + The model config specifying the option would look like: + + ```proto + default_model_name: "another_file_name.pt" + ``` + ### Parameters * `DISABLE_OPTIMIZED_EXECUTION`: From a95f663e67a713c792e48f2e3c0c6d282e63de84 Mon Sep 17 00:00:00 2001 From: Misha Chornyi <99709299+mc-nv@users.noreply.github.com> Date: Mon, 3 Nov 2025 09:43:09 -0800 Subject: [PATCH 73/76] Adding libtorch_nvshmem.so (#162) (#166) * Adding libtorch_nvshmem.so * change: CPU only build doesn't have CUDA_VERSION environment variable. Using flag to control library inclusion. * Removing generation expression --- CMakeLists.txt | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/CMakeLists.txt b/CMakeLists.txt index 3afe90b..3ec2d55 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -50,6 +50,7 @@ option(TRITON_ENABLE_STATS "Include statistics collections in backend" ON) option(TRITON_ENABLE_NVTX "Include nvtx markers collection in backend." 
OFF) option(TRITON_PYTORCH_ENABLE_TORCHTRT "Enable TorchTRT support" OFF) option(TRITON_PYTORCH_ENABLE_TORCHVISION "Enable Torchvision support" ON) +option(TRITON_PYTORCH_NVSHMEM "Enable NVSHMEM support" ON) set(TRITON_PYTORCH_DOCKER_IMAGE "" CACHE STRING "Docker image containing the PyTorch build required by backend.") set(TRITON_PYTORCH_INCLUDE_PATHS "" CACHE PATH "Paths to Torch includes") @@ -162,6 +163,13 @@ set(PT_LIBS "libjpeg.so.62" ) +if (${TRITON_PYTORCH_NVSHMEM}) + set(PT_LIBS + ${PT_LIBS} + "libtorch_nvshmem.so" + ) +endif() # TRITON_PYTORCH_NVSHMEM + if (${TRITON_PYTORCH_ENABLE_TORCHVISION}) set(PT_LIBS ${PT_LIBS} @@ -238,6 +246,7 @@ if (${TRITON_PYTORCH_DOCKER_BUILD}) COMMAND docker cp pytorch_backend_ptlib:${PY_INSTALL_PATH}/torch/lib/libtorch_cuda_linalg.so libtorch_cuda_linalg.so COMMAND docker cp pytorch_backend_ptlib:${PY_INSTALL_PATH}/torch/lib/libtorch_global_deps.so libtorch_global_deps.so COMMAND docker cp pytorch_backend_ptlib:${PY_INSTALL_PATH}/torch/lib/libcaffe2_nvrtc.so libcaffe2_nvrtc.so + COMMAND /bin/sh -c "if [ ${TRITON_PYTORCH_NVSHMEM} = 'ON' ]; then docker cp pytorch_backend_ptlib:${PY_INSTALL_PATH}/torch/lib/libtorch_nvshmem.so libtorch_nvshmem.so; fi" COMMAND /bin/sh -c "if [ ${TRITON_PYTORCH_ENABLE_TORCHVISION} = 'ON' ]; then if [ ${RHEL_BUILD} = 'ON' ]; then docker cp -a -L pytorch_backend_ptlib:/usr/local/lib64/libtorchvision.so libtorchvision.so; else docker cp -a -L pytorch_backend_ptlib:/usr/local/${LIB_DIR}/libtorchvision.so.1 libtorchvision.so.1; fi; fi" COMMAND /bin/sh -c "if [ ${TRITON_PYTORCH_ENABLE_TORCHVISION} = 'ON' ]; then docker cp pytorch_backend_ptlib:/opt/pytorch/vision/torchvision/csrc include/torchvision/torchvision; fi" COMMAND /bin/sh -c "if [ ${TRITON_PYTORCH_ENABLE_TORCHTRT} = 'ON' ]; then docker cp pytorch_backend_ptlib:/usr/local/lib/python3.12/dist-packages/torch_tensorrt/lib/libtorchtrt_runtime.so libtorchtrt_runtime.so; fi" From 45bd8e5f6c5b853d9618b5eb06fbf17188694ceb Mon Sep 17 00:00:00 2001 From: Misha Chornyi <99709299+mc-nv@users.noreply.github.com> Date: Tue, 4 Nov 2025 10:19:28 -0800 Subject: [PATCH 74/76] fix(pre-commit): update hooks versions (#169) --- .github/workflows/pre-commit.yml | 10 +++++----- .pre-commit-config.yaml | 8 ++++---- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/.github/workflows/pre-commit.yml b/.github/workflows/pre-commit.yml index ab4bd95..4fa1873 100644 --- a/.github/workflows/pre-commit.yml +++ b/.github/workflows/pre-commit.yml @@ -1,4 +1,4 @@ -# Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# Copyright 2023-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions @@ -31,8 +31,8 @@ on: jobs: pre-commit: - runs-on: ubuntu-22.04 + runs-on: ubuntu-latest steps: - - uses: actions/checkout@v3 - - uses: actions/setup-python@v3 - - uses: pre-commit/action@v3.0.0 + - uses: actions/checkout@v5.0.0 + - uses: actions/setup-python@v6.0.0 + - uses: pre-commit/action@v3.0.1 diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 298baab..3c76a6e 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,4 +1,4 @@ -# Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# Copyright 2023-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
# # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions @@ -25,7 +25,7 @@ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. repos: -- repo: https://github.com/timothycrosley/isort +- repo: https://github.com/PyCQA/isort rev: 5.12.0 hooks: - id: isort @@ -36,7 +36,7 @@ repos: - id: black types_or: [python, cython] - repo: https://github.com/PyCQA/flake8 - rev: 5.0.4 + rev: 7.3.0 hooks: - id: flake8 args: [--max-line-length=88, --select=C,E,F,W,B,B950, --extend-ignore = E203,E501] @@ -57,7 +57,7 @@ repos: # More details about these pre-commit hooks here: # https://pre-commit.com/hooks.html - repo: https://github.com/pre-commit/pre-commit-hooks - rev: v4.4.0 + rev: v6.0.0 hooks: - id: check-case-conflict - id: check-executables-have-shebangs From cb9336c0c05c349bfd54a611680840cabb2b1a03 Mon Sep 17 00:00:00 2001 From: J Wyman Date: Tue, 4 Nov 2025 16:59:30 -0500 Subject: [PATCH 75/76] maintenance: Separate Code into Separate Files (#163) * maintenance: Separate Code into Separate Files This change breaks the monolithic src/libtorch.cc into multiple files, with a modern separation of classes into separate header and code files. * Accept Rename Renamed 'string_utilities.*' to 'string_utils.*' as requested. --- CMakeLists.txt | 3 + src/libtorch.cc | 2493 +---------------------------------- src/libtorch.hh | 59 + src/model_instance_state.cc | 1632 +++++++++++++++++++++++ src/model_instance_state.hh | 178 +++ src/model_state.cc | 495 +++++++ src/model_state.hh | 131 ++ src/naming_convention.hh | 40 + src/string_utils.cc | 254 ++++ src/string_utils.hh | 106 ++ 10 files changed, 2902 insertions(+), 2489 deletions(-) create mode 100644 src/libtorch.hh create mode 100644 src/model_instance_state.cc create mode 100644 src/model_instance_state.hh create mode 100644 src/model_state.cc create mode 100644 src/model_state.hh create mode 100644 src/naming_convention.hh create mode 100644 src/string_utils.cc create mode 100644 src/string_utils.hh diff --git a/CMakeLists.txt b/CMakeLists.txt index 3ec2d55..5b0e399 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -289,6 +289,9 @@ add_library( src/libtorch.cc src/libtorch_utils.cc src/libtorch_utils.h + src/model_instance_state.cc + src/model_state.cc + src/string_utils.cc ) add_library( diff --git a/src/libtorch.cc b/src/libtorch.cc index c873375..500f1f5 100644 --- a/src/libtorch.cc +++ b/src/libtorch.cc @@ -24,2498 +24,13 @@ // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-#include - -#include -#include -#include - -#include "libtorch_utils.h" -#include "triton/backend/backend_common.h" -#include "triton/backend/backend_input_collector.h" -#include "triton/backend/backend_memory.h" -#include "triton/backend/backend_model.h" -#include "triton/backend/backend_model_instance.h" -#include "triton/backend/backend_output_responder.h" -#include "triton/common/nvtx.h" -#include "triton/core/tritonbackend.h" - -#ifdef TRITON_PYTORCH_ENABLE_TORCHVISION -// Suppress warnings in torch headers -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wsign-compare" -#pragma warning(push, 0) -#include -#include // Torchvision header -#pragma warning(pop) -#pragma GCC diagnostic pop -#endif // TRITON_PYTORCH_ENABLE_TORCHVISION - -#ifdef TRITON_ENABLE_GPU -#include -#include -#include -#endif // TRITON_ENABLE_GPU - -// for thread control -// https://pytorch.org/docs/stable/notes/cpu_threading_torchscript_inference.html#runtime-api -// https://github.com/pytorch/pytorch/blob/v2.2.1-rc3/aten/src/ATen/Parallel.h#L133 -#include - +#include "libtorch.hh" // // PyTorch C++ (LibTorch) Backend that implements the TRITONBACKEND API. // -namespace { -std::once_flag pytorch_interop_threads_flag; -std::once_flag pytorch_intraop_threads_flag; -} // namespace - -namespace triton { namespace backend { namespace pytorch { - -// -// ModelState -// -// State associated with a model that is using this backend. An object -// of this class is created and associated with each -// TRITONBACKEND_Model. -// -class ModelState : public BackendModel { - public: - static TRITONSERVER_Error* Create( - TRITONBACKEND_Model* triton_model, ModelState** state); - virtual ~ModelState() = default; - - // Load a TorchScript model using 'artifact_name' as the name for the - // TorchScript file. Return in 'model_path' the full path to the - // TorchScript file, return in 'torch_model' the Torch Module - // representing the model. - TRITONSERVER_Error* LoadModel( - const std::string& artifact_name, const torch::Device device, - std::string* model_path, const TRITONSERVER_InstanceGroupKind& kind, - std::shared_ptr* torch_model); - - bool EnabledOptimizedExecution() { return enable_optimized_execution_; } - const std::pair& EnabledTensorExprFuser() const - { - return enable_tensor_fuser_pair_; - } - const std::pair& EnabledJitProfiling() const - { - return enable_jit_profiling_pair_; - } - const std::pair& EnabledJitExecutor() const - { - return enable_jit_executor_pair_; - } - bool EnabledInferenceMode() { return enable_inference_mode_; } - bool EnabledCudnn() { return enable_cudnn_; } - bool EnabledCacheCleaning() { return enable_cache_cleaning_; } - - bool EnabledWeightSharing() { return enable_weight_sharing_; } - const std::map>& ModelOutputs() - { - return model_outputs_; - } - - private: - ModelState(TRITONBACKEND_Model* triton_model); - TRITONSERVER_Error* AutoCompleteConfig(); - - // Parses and validates parameters in config - TRITONSERVER_Error* ParseParameters(); - - // Flag to indicate whether optimized execution is enabled. Defaults to true. - bool enable_optimized_execution_; - - // Flag to indicate whether inference mode is enabled. Defaults to false. - bool enable_inference_mode_; - - // Flag to indicate whether cudnn is enabled. Defaults to true. - bool enable_cudnn_; - - // Flag to indicate whether cache cleaning after each run is enabled. - // Defaults to false. - bool enable_cache_cleaning_; - - // Flag to indicate whether weight sharing is enabled. Defaults to false. 
- bool enable_weight_sharing_; - - // Flag pairs to indicate if various JIT settings are set and - // enabled respectively. Defaults to (false, true). Default behavior - // is to do nothing if not explicitly set. - std::pair enable_tensor_fuser_pair_; - std::pair enable_jit_profiling_pair_; - std::pair enable_jit_executor_pair_; - - // Model mapping for shared TorchScript model across all instances on the - // same device. The key is a pair of isGPU and device index. - std::map< - std::pair, std::shared_ptr> - torch_models_; - - // model_outputs is a map that contains unique outputs that the model must - // provide. The first pair is the model output index and the second is - // the index in the model state, -1 is used if one is not required. - // In the model configuration, the output in the state configuration - // can have intersection with the outputs section of the model. If an output - // is specified both in the output section and state section, it indicates - // that the backend must return the output state to the client too. - std::map> model_outputs_; -}; - -TRITONSERVER_Error* -ModelState::Create(TRITONBACKEND_Model* triton_model, ModelState** state) -{ - try { - *state = new ModelState(triton_model); - } - catch (const BackendModelException& ex) { - RETURN_ERROR_IF_TRUE( - ex.err_ == nullptr, TRITONSERVER_ERROR_INTERNAL, - std::string("unexpected nullptr in BackendModelException")); - RETURN_IF_ERROR(ex.err_); - } - - // Auto-complete the configuration if requested... - bool auto_complete_config = false; - RETURN_IF_ERROR(TRITONBACKEND_ModelAutoCompleteConfig( - triton_model, &auto_complete_config)); - if (auto_complete_config) { - RETURN_IF_ERROR((*state)->AutoCompleteConfig()); - RETURN_IF_ERROR((*state)->SetModelConfig()); - } - - auto& model_outputs = (*state)->model_outputs_; - // Parse the output states in the model configuration - triton::common::TritonJson::Value sequence_batching; - if ((*state)->ModelConfig().Find("sequence_batching", &sequence_batching)) { - triton::common::TritonJson::Value states; - if (sequence_batching.Find("state", &states)) { - for (size_t i = 0; i < states.ArraySize(); i++) { - triton::common::TritonJson::Value state; - RETURN_IF_ERROR(states.IndexAsObject(i, &state)); - std::string output_state_name; - RETURN_IF_ERROR( - state.MemberAsString("output_name", &output_state_name)); - auto it = model_outputs.find(output_state_name); - if (it == model_outputs.end()) { - model_outputs.insert({output_state_name, std::make_pair(-1, i)}); - } else { - it->second.second = i; - } - } - } - } - - // Parse the output names in the model configuration - triton::common::TritonJson::Value outputs; - RETURN_IF_ERROR((*state)->ModelConfig().MemberAsArray("output", &outputs)); - for (size_t i = 0; i < outputs.ArraySize(); i++) { - triton::common::TritonJson::Value output; - THROW_IF_BACKEND_INSTANCE_ERROR(outputs.IndexAsObject(i, &output)); - - // Use names from ModelConfig by reference since the model - // config will persist longer than this inference execution. 
- std::string output_name; - THROW_IF_BACKEND_INSTANCE_ERROR( - output.MemberAsString("name", &output_name)); - - auto it = model_outputs.find(output_name); - if (it == model_outputs.end()) { - model_outputs.insert({output_name, std::make_pair(i, -1)}); - } else { - it->second.first = i; - } - } - - RETURN_IF_ERROR((*state)->ParseParameters()); - - return nullptr; // success -} - -ModelState::ModelState(TRITONBACKEND_Model* triton_model) - : BackendModel(triton_model), enable_optimized_execution_(true), - enable_inference_mode_(true), enable_cudnn_(true), - enable_cache_cleaning_(false), enable_weight_sharing_(false), - enable_tensor_fuser_pair_({false, true}), - enable_jit_profiling_pair_({false, true}), - enable_jit_executor_pair_({false, true}) -{ -} - -TRITONSERVER_Error* -ModelState::LoadModel( - const std::string& artifact_name, const torch::Device device, - std::string* model_path, const TRITONSERVER_InstanceGroupKind& kind, - std::shared_ptr* torch_model) -{ - // Find the TorchScript file that describes the model. If the model - // configuration doesn't have an explicit model file specified then - // use the default name ("model.pt"). - std::string cc_model_filename = artifact_name; - if (cc_model_filename.empty()) { - cc_model_filename = "model.pt"; - } - - *model_path = JoinPath( - {RepositoryPath(), std::to_string(Version()), cc_model_filename}); - - { - bool exists; - RETURN_IF_ERROR(FileExists(*model_path, &exists)); - RETURN_ERROR_IF_FALSE( - exists, TRITONSERVER_ERROR_UNAVAILABLE, - std::string("unable to find '") + *model_path + - "' for model instance '" + Name() + "'"); - } - - // If weight sharing is enabled, skip loading model if - // it is already available on the target device - std::pair device_pair; - if (enable_weight_sharing_) { - device_pair = std::make_pair(!device.is_cpu(), device.index()); - auto mit = torch_models_.find(device_pair); - if (mit != torch_models_.end()) { - *torch_model = mit->second; - LOG_MESSAGE( - TRITONSERVER_LOG_INFO, - (std::string("Reusing TorchScript model for instance '") + Name() + - "'") - .c_str()); - return nullptr; // success - } - } - - // Serialize the torch model to string - std::string model_data_str; - RETURN_IF_ERROR(ReadTextFile(*model_path, &model_data_str)); - - // InferenceMode should be used to guard all tensors operations including - // model loading: https://pytorch.org/cppdocs/notes/inference_mode.html - torch::InferenceMode infer_guard(EnabledInferenceMode()); - - try { - std::istringstream model_stream(model_data_str); - if (kind == TRITONSERVER_INSTANCEGROUPKIND_MODEL) { - // Load the model without selecting a device. - torch_model->reset( - new torch::jit::Module(torch::jit::load(model_stream))); - } else { - torch_model->reset( - new torch::jit::Module(torch::jit::load(model_stream, device))); - } - } - catch (const std::exception& ex) { - return TRITONSERVER_ErrorNew( - TRITONSERVER_ERROR_INTERNAL, - ("failed to load model '" + Name() + "': " + ex.what()).c_str()); - } - - if (enable_weight_sharing_) { - if (!((torch_models_.emplace(device_pair, *torch_model)).second)) { - std::string type = device.is_cpu() ? 
"CPU" : "GPU"; - LOG_MESSAGE( - TRITONSERVER_LOG_WARN, - (std::string("Model already found on target ") + type + " device " + - "(id " + std::to_string(device.index()) + ") for '" + Name() + "'") - .c_str()); - } - } - - return nullptr; // success -} - -TRITONSERVER_Error* -ModelState::AutoCompleteConfig() -{ - // Auto-complete configuration is not supported since PyTorch does not - // store/capture sufficient model metadata so just log error instead. - LOG_MESSAGE( - TRITONSERVER_LOG_WARN, - (std::string("skipping model configuration auto-complete for '") + - Name() + "': not supported for pytorch backend") - .c_str()); - - return nullptr; // success -} - -TRITONSERVER_Error* -ModelState::ParseParameters() -{ - triton::common::TritonJson::Value params; - bool status = model_config_.Find("parameters", ¶ms); - if (status) { - // If 'DISABLE_OPTIMIZED_EXECUTION' is not present in 'parameters' then no - // update is made to 'enable_optimized_execution_'. - bool disable_optimized_execution = false; - TRITONSERVER_Error* err = ParseParameter( - params, "DISABLE_OPTIMIZED_EXECUTION", &disable_optimized_execution); - if (err != nullptr) { - if (TRITONSERVER_ErrorCode(err) != TRITONSERVER_ERROR_NOT_FOUND) { - return err; - } else { - TRITONSERVER_ErrorDelete(err); - } - } - enable_optimized_execution_ = !disable_optimized_execution; - - LOG_MESSAGE( - TRITONSERVER_LOG_INFO, - (std::string("Optimized execution is ") + - (enable_optimized_execution_ ? "enabled" : "disabled") + - " for model instance '" + Name() + "'") - .c_str()); - - // If 'ENABLE_CACHE_CLEANING' is not present in 'parameters' then - // no update is made to 'enable_cache_cleaning_'. - err = ParseParameter( - params, "ENABLE_CACHE_CLEANING", &enable_cache_cleaning_); - if (err != nullptr) { - if (TRITONSERVER_ErrorCode(err) != TRITONSERVER_ERROR_NOT_FOUND) { - return err; - } else { - TRITONSERVER_ErrorDelete(err); - } - } - - LOG_MESSAGE( - TRITONSERVER_LOG_INFO, - (std::string("Cache Cleaning is ") + - (enable_cache_cleaning_ ? "enabled" : "disabled") + - " for model instance '" + Name() + "'") - .c_str()); - - // If 'INFERENCE_MODE' is not present in 'parameters' then no update is made - // to 'enable_inference_mode_'. - err = ParseParameter(params, "INFERENCE_MODE", &enable_inference_mode_); - if (err != nullptr) { - if (TRITONSERVER_ErrorCode(err) != TRITONSERVER_ERROR_NOT_FOUND) { - return err; - } else { - TRITONSERVER_ErrorDelete(err); - } - } - LOG_MESSAGE( - TRITONSERVER_LOG_INFO, - (std::string("Inference Mode is ") + - (enable_inference_mode_ ? "enabled" : "disabled") + - " for model instance '" + Name() + "'") - .c_str()); - - // If 'DISABLE_CUDNN' is not present in 'parameters' then no update is made - // to 'enable_cudnn_'. - bool disable_cudnn = false; - err = ParseParameter(params, "DISABLE_CUDNN", &disable_cudnn); - if (err != nullptr) { - if (TRITONSERVER_ErrorCode(err) != TRITONSERVER_ERROR_NOT_FOUND) { - return err; - } else { - TRITONSERVER_ErrorDelete(err); - } - } - enable_cudnn_ = !disable_cudnn; - LOG_MESSAGE( - TRITONSERVER_LOG_INFO, - (std::string("cuDNN is ") + (enable_cudnn_ ? "enabled" : "disabled") + - " for model instance '" + Name() + "'") - .c_str()); - - // If 'ENABLE_TENSOR_FUSER' is not present in 'parameters' then no - // update is made to 'enable_tensor_fuser'. 
- bool enable_tensor_fuser = false; - err = ParseParameter(params, "ENABLE_TENSOR_FUSER", &enable_tensor_fuser); - if (err != nullptr) { - if (TRITONSERVER_ErrorCode(err) != TRITONSERVER_ERROR_NOT_FOUND) { - return err; - } else { - TRITONSERVER_ErrorDelete(err); - } - } else { - enable_tensor_fuser_pair_ = {true, enable_tensor_fuser}; - LOG_MESSAGE( - TRITONSERVER_LOG_INFO, - (std::string("Tensor fuser is ") + - (enable_tensor_fuser ? "enabled" : "disabled") + - " for model instance '" + Name() + "'") - .c_str()); - } - - // If 'ENABLE_WEIGHT_SHARING' is not present in 'parameters' then no - // update is made to 'enable_weight_sharing'. - err = ParseParameter( - params, "ENABLE_WEIGHT_SHARING", &enable_weight_sharing_); - if (err != nullptr) { - if (TRITONSERVER_ErrorCode(err) != TRITONSERVER_ERROR_NOT_FOUND) { - return err; - } else { - TRITONSERVER_ErrorDelete(err); - } - } else { - LOG_MESSAGE( - TRITONSERVER_LOG_INFO, - (std::string("Weight sharing is ") + - (enable_weight_sharing_ ? "enabled" : "disabled") + - " for model instance '" + Name() + "'") - .c_str()); - } - - // If 'ENABLE_JIT_PROFILING' is not present in 'parameters' then no update - // is made to 'enable_jit_profiling'. - bool enable_jit_profiling = false; - err = ParseParameter(params, "ENABLE_JIT_PROFILING", &enable_jit_profiling); - if (err != nullptr) { - if (TRITONSERVER_ErrorCode(err) != TRITONSERVER_ERROR_NOT_FOUND) { - return err; - } else { - TRITONSERVER_ErrorDelete(err); - } - } else { - enable_jit_profiling_pair_ = {true, enable_jit_profiling}; - LOG_MESSAGE( - TRITONSERVER_LOG_INFO, - (std::string("Jit profiling is ") + - (enable_jit_profiling ? "enabled" : "disabled") + - " for model instance '" + Name() + "'") - .c_str()); - } - - // If 'ENABLE_JIT_EXECUTOR' is not present in 'parameters' then no update is - // made to 'enable_jit_executor'. - bool enable_jit_executor = false; - err = ParseParameter(params, "ENABLE_JIT_EXECUTOR", &enable_jit_executor); - if (err != nullptr) { - if (TRITONSERVER_ErrorCode(err) != TRITONSERVER_ERROR_NOT_FOUND) { - return err; - } else { - TRITONSERVER_ErrorDelete(err); - } - } else { - enable_jit_executor_pair_ = {true, enable_jit_executor}; - LOG_MESSAGE( - TRITONSERVER_LOG_INFO, - (std::string("Jit executor is ") + - (enable_jit_executor ? "enabled" : "disabled") + - " for model instance '" + Name() + "'") - .c_str()); - } - - // If 'INTRA_OP_THREAD_COUNT' is not present in 'parameters' then no update - // is made to 'intra_op_thread_count', which by default will take all - // threads - int intra_op_thread_count = -1; - err = - ParseParameter(params, "INTRA_OP_THREAD_COUNT", &intra_op_thread_count); - if (err != nullptr) { - if (TRITONSERVER_ErrorCode(err) != TRITONSERVER_ERROR_NOT_FOUND) { - return err; - } else { - TRITONSERVER_ErrorDelete(err); - } - } else { - if (intra_op_thread_count > 0) { - // at::set_num_threads() does not throw if called more than once, but - // issues warnings. std::call_once() is useful to limit these. 
- std::call_once(pytorch_intraop_threads_flag, [intra_op_thread_count]() { - at::set_num_threads(intra_op_thread_count); - }); - LOG_MESSAGE( - TRITONSERVER_LOG_INFO, - (std::string("Intra op thread count is set to ") + - std::to_string(at::get_num_threads()) + " for model instance '" + - Name() + "'") - .c_str()); - } - } - - // If 'INTER_OP_THREAD_COUNT' is not present in 'parameters' then no update - // is made to 'inter_op_thread_count', which by default will take all - // threads - int inter_op_thread_count = -1; - err = - ParseParameter(params, "INTER_OP_THREAD_COUNT", &inter_op_thread_count); - if (err != nullptr) { - if (TRITONSERVER_ErrorCode(err) != TRITONSERVER_ERROR_NOT_FOUND) { - return err; - } else { - TRITONSERVER_ErrorDelete(err); - } - } else { - if (inter_op_thread_count > 0) { - // at::set_num_interop_threads() throws if called more than once. - // std::call_once() should prevent this, but try/catch is additionally - // used for safety. - std::call_once(pytorch_interop_threads_flag, [inter_op_thread_count]() { - try { - at::set_num_interop_threads(inter_op_thread_count); - } - catch (const c10::Error& e) { - // do nothing - } - }); - LOG_MESSAGE( - TRITONSERVER_LOG_INFO, - (std::string("Inter op thread count is set to ") + - std::to_string(at::get_num_interop_threads()) + - " for model instance '" + Name() + "'") - .c_str()); - } - } - } - - return nullptr; -} - -// The naming convention followed for inputs/outputs in the model configuration. -// Outputs don't support FORWARD_ARGUMENT. -enum class NamingConvention { - NAMED_INDEX, - FORWARD_ARGUMENT, - STRICT_CONFIG_ORDERING -}; - -// -// ModelInstanceState -// -// State associated with a model instance. An object of this class is -// created and associated with each TRITONBACKEND_ModelInstance. -// -class ModelInstanceState : public BackendModelInstance { - public: - static TRITONSERVER_Error* Create( - ModelState* model_state, - TRITONBACKEND_ModelInstance* triton_model_instance, - ModelInstanceState** state); - virtual ~ModelInstanceState(); - - // Get the state of the model that corresponds to this instance. - ModelState* StateForModel() const { return model_state_; } - - // Execute... 
- void ProcessRequests( - TRITONBACKEND_Request** requests, const uint32_t request_count); - - // Clear CUDA cache - void ClearCache(); - - private: - ModelInstanceState( - ModelState* model_state, - TRITONBACKEND_ModelInstance* triton_model_instance); - TRITONSERVER_Error* ValidateBooleanSequenceControl( - triton::common::TritonJson::Value& sequence_batching, - const std::string& control_kind, bool required, bool* have_control); - TRITONSERVER_Error* ValidateTypedSequenceControl( - triton::common::TritonJson::Value& sequence_batching, - const std::string& control_kind, bool required, bool* have_control); - TRITONSERVER_Error* ValidateInputs(const size_t expected_input_cnt); - void AddInputToMap( - NamingConvention naming_convention, - const std::vector allowed_inputs, const std::string& io_name, - const uint32_t index); - TRITONSERVER_Error* ValidateOutputs(); - void Execute( - std::vector* responses, - const uint32_t response_count, - std::vector* input_tensors, - std::vector* output_tensors); - TRITONSERVER_Error* SetInputTensors( - size_t total_batch_size, TRITONBACKEND_Request** requests, - const uint32_t request_count, - std::vector* responses, - BackendInputCollector* collector, std::vector* input_names, - std::vector* input_tensors, bool* cuda_copy); - TRITONSERVER_Error* ReadOutputTensors( - size_t total_batch_size, - const std::vector& output_tensors, - TRITONBACKEND_Request** requests, const uint32_t request_count, - std::vector* responses); - TRITONSERVER_Error* RecordBackendTimestamp( - uint64_t* timestamp, void* cuda_event); - - // Get the naming convention for inputs/outputs from the model configuration - TRITONSERVER_Error* GetNamingConvention( - NamingConvention* naming_convention, - const std::vector& allowed_io); - - // Create CUDA events for statistics collection. - void CreateCudaEvents(const int32_t& device_id); - - // Get the appropriate CUDA stream for input and output handling based on the - // instance group type. - cudaStream_t GetCudaStreamByInstanceKind(); - - // Replace the default CUDA stream with the stream we created to ensure proper - // cuda stream synchronization. - void SetCurrentCudaStream( - const cudaStream_t& stream, const int32_t& device_id); - - // Get the elapsed time between two CUDA events. - float GetCudaEventElapsedTime( - const cudaEvent_t& start_event, const cudaEvent_t& end_event); - - ModelState* model_state_; - - // The full path to the TorchScript model file. - std::string model_path_; - - std::shared_ptr torch_model_; - torch::Device device_; - - // Map from configuration name for an input to the index of - // that input in the model. - std::unordered_map input_index_map_; - uint32_t batch_input_count_ = 0; - - // Map from configuration name for an output to the index of - // that output in the model. - std::unordered_map output_index_map_; - std::unordered_map output_dtype_map_; - - // If the input to the tensor is a dictionary of tensors. - bool is_dict_input_; - - // If the model supports batching. - bool supports_batching_; - - cudaEvent_t compute_input_start_event_; - cudaEvent_t compute_infer_start_event_; - cudaEvent_t compute_output_start_event_; - - // Store the cuda streams created for the 'KIND_MODEL' instance group. - std::vector stream_vec_; - - // The number of available devices. 
- int device_cnt_; -}; - -TRITONSERVER_Error* -ModelInstanceState::Create( - ModelState* model_state, TRITONBACKEND_ModelInstance* triton_model_instance, - ModelInstanceState** state) -{ - try { - *state = new ModelInstanceState(model_state, triton_model_instance); - } - catch (const BackendModelInstanceException& ex) { - RETURN_ERROR_IF_TRUE( - ex.err_ == nullptr, TRITONSERVER_ERROR_INTERNAL, - std::string("unexpected nullptr in BackendModelInstanceException")); - RETURN_IF_ERROR(ex.err_); - } - - return nullptr; // success -} - -ModelInstanceState::ModelInstanceState( - ModelState* model_state, TRITONBACKEND_ModelInstance* triton_model_instance) - : BackendModelInstance(model_state, triton_model_instance), - model_state_(model_state), device_(torch::kCPU), is_dict_input_(false), - device_cnt_(0) -{ - if (Kind() == TRITONSERVER_INSTANCEGROUPKIND_GPU) { -#ifdef TRITON_ENABLE_GPU - device_ = torch::Device(torch::kCUDA, DeviceId()); - CreateCudaEvents(DeviceId()); -#endif - } - -#ifdef TRITON_ENABLE_GPU - device_cnt_ = torch::cuda::device_count(); -#endif - - THROW_IF_BACKEND_INSTANCE_ERROR(model_state->LoadModel( - ArtifactFilename(), device_, &model_path_, Kind(), &torch_model_)); - - if (Kind() == TRITONSERVER_INSTANCEGROUPKIND_MODEL) { -#ifdef TRITON_ENABLE_GPU - // Since we cannot determine the exact devices used by the model, we create - // a CUDA stream for every available device to ensure proper synchronization - // of CUDA streams. This approach may have implications when a timestamp is - // captured on a device that is not used by the model. Currently, this issue - // is addressed by synchronizing the CUDA streams before recording - // timestamps to prevent timestamp skewing. However, in the future, any - // modifications to the CUDA stream synchronization logic should be handled - // with caution. - for (int i = 0; i < device_cnt_; i++) { - cudaStream_t stream; - THROW_IF_BACKEND_INSTANCE_ERROR( - CreateCudaStream(i, 0 /* cuda_stream_priority */, &stream)); - stream_vec_.push_back(stream); - } - if (!stream_vec_.empty()) { - // Create CUDA events on the first device that will be used for collecting - // inputs/outputs. - CreateCudaEvents(0); - } -#endif - } - - size_t expected_input_cnt = 0; - { - triton::common::TritonJson::Value inputs; - if (model_state->ModelConfig().Find("input", &inputs)) { - expected_input_cnt = inputs.ArraySize(); - } - - triton::common::TritonJson::Value config_batch_inputs; - if (model_state->ModelConfig().Find("batch_input", &config_batch_inputs)) { - batch_input_count_ = config_batch_inputs.ArraySize(); - expected_input_cnt += batch_input_count_; - } - } - - // If this is a sequence model then make sure that the required - // inputs are present in the model and have the correct shape and - // datatype. 
- triton::common::TritonJson::Value sequence_batching; - if (model_state->ModelConfig().Find( - "sequence_batching", &sequence_batching)) { - bool have_start, have_end, have_ready, have_corrid; - THROW_IF_BACKEND_INSTANCE_ERROR(ValidateBooleanSequenceControl( - sequence_batching, "CONTROL_SEQUENCE_START", false /* required */, - &have_start)); - THROW_IF_BACKEND_INSTANCE_ERROR(ValidateBooleanSequenceControl( - sequence_batching, "CONTROL_SEQUENCE_END", false /* required */, - &have_end)); - THROW_IF_BACKEND_INSTANCE_ERROR(ValidateBooleanSequenceControl( - sequence_batching, "CONTROL_SEQUENCE_READY", false /* required */, - &have_ready)); - THROW_IF_BACKEND_INSTANCE_ERROR(ValidateTypedSequenceControl( - sequence_batching, "CONTROL_SEQUENCE_CORRID", false /* required */, - &have_corrid)); - if (have_start) { - expected_input_cnt += 1; - } - if (have_end) { - expected_input_cnt += 1; - } - if (have_ready) { - expected_input_cnt += 1; - } - if (have_corrid) { - expected_input_cnt += 1; - } - // Add the state inputs to the expected count - triton::common::TritonJson::Value states; - if (sequence_batching.Find("state", &states)) { - expected_input_cnt += states.ArraySize(); - } - } - supports_batching_ = model_state_->MaxBatchSize() > 0; - - THROW_IF_BACKEND_INSTANCE_ERROR(ValidateInputs(expected_input_cnt)); - THROW_IF_BACKEND_INSTANCE_ERROR(ValidateOutputs()); -} - -void -ModelInstanceState::ClearCache() -{ -#ifdef TRITON_ENABLE_GPU - if (device_.is_cuda() || - ((Kind() == TRITONSERVER_INSTANCEGROUPKIND_MODEL) && (device_cnt_ > 0))) { - c10::cuda::CUDACachingAllocator::emptyCache(); - } -#endif // TRITON_ENABLE_GPU -} - -ModelInstanceState::~ModelInstanceState() -{ - torch_model_.reset(); - ClearCache(); - - if (Kind() == TRITONSERVER_INSTANCEGROUPKIND_MODEL) { -#ifdef TRITON_ENABLE_GPU - for (size_t i = 0; i < stream_vec_.size(); i++) { - LOG_IF_ERROR( - ConvertCUDAStatusToTritonError( - cudaSetDevice(i), TRITONSERVER_ERROR_INTERNAL, - "Failed to set the device"), - "Failed to set the device"); - - LOG_IF_ERROR( - ConvertCUDAStatusToTritonError( - cudaStreamDestroy(stream_vec_[i]), TRITONSERVER_ERROR_INTERNAL, - "Failed to destroy cuda stream"), - "~ModelInstanceState error: "); - stream_vec_[i] = nullptr; - } -#endif - } -} - -TRITONSERVER_Error* -ModelInstanceState::ValidateBooleanSequenceControl( - triton::common::TritonJson::Value& sequence_batching, - const std::string& control_kind, bool required, bool* have_control) -{ - std::string tensor_name; - std::string tensor_datatype; - RETURN_IF_ERROR(GetBooleanSequenceControlProperties( - sequence_batching, model_state_->Name(), control_kind, required, - &tensor_name, &tensor_datatype, nullptr, nullptr, nullptr, nullptr, - nullptr, nullptr)); - *have_control = !tensor_name.empty(); - if (*have_control) { - std::string deliminator = "__"; - int ip_index = 0; - int start_pos = tensor_name.find(deliminator); - if (start_pos == -1) { - return TRITONSERVER_ErrorNew( - TRITONSERVER_ERROR_INTERNAL, - ("input '" + tensor_name + - "' does not follow __ naming convention.") - .c_str()); - } - - // check if the index part of the name is not an integer - std::string index_str = tensor_name.substr(start_pos + 2); - for (auto itr = index_str.begin(); itr != index_str.end(); itr++) { - if (std::isdigit(*itr) == 0) { - return TRITONSERVER_ErrorNew( - TRITONSERVER_ERROR_INTERNAL, - ("input '" + tensor_name + - "' does not follow __ naming convention.") - .c_str()); - } - } - - ip_index = std::atoi(tensor_name.substr(start_pos + 2).c_str()); - 
input_index_map_[tensor_name] = ip_index; - } - - return nullptr; // success -} - -TRITONSERVER_Error* -ModelInstanceState::ValidateTypedSequenceControl( - triton::common::TritonJson::Value& sequence_batching, - const std::string& control_kind, bool required, bool* have_control) -{ - std::string tensor_name; - std::string tensor_datatype; - RETURN_IF_ERROR(GetTypedSequenceControlProperties( - sequence_batching, model_state_->Name(), control_kind, required, - &tensor_name, &tensor_datatype)); - *have_control = !tensor_name.empty(); - if (*have_control) { - std::string deliminator = "__"; - int ip_index = 0; - int start_pos = tensor_name.find(deliminator); - if (start_pos == -1) { - return TRITONSERVER_ErrorNew( - TRITONSERVER_ERROR_INTERNAL, - ("input '" + tensor_name + - "' does not follow __ naming convention.") - .c_str()); - } - - // check if the index part of the name is not an integer - std::string index_str = tensor_name.substr(start_pos + 2); - for (auto itr = index_str.begin(); itr != index_str.end(); itr++) { - if (std::isdigit(*itr) == 0) { - return TRITONSERVER_ErrorNew( - TRITONSERVER_ERROR_INTERNAL, - ("input '" + tensor_name + - "' does not follow __ naming convention.") - .c_str()); - } - } - - // check if the data type is supported by PyTorch - if (!ModelConfigDataTypeToTorchType(tensor_datatype).first) { - return TRITONSERVER_ErrorNew( - TRITONSERVER_ERROR_INTERNAL, - ("input '" + tensor_name + "' type '" + tensor_datatype + - "' is not supported by PyTorch.") - .c_str()); - } - - ip_index = std::atoi(tensor_name.substr(start_pos + 2).c_str()); - input_index_map_[tensor_name] = ip_index; - } - - return nullptr; // success -} - -void -ModelInstanceState::AddInputToMap( - NamingConvention naming_convention, - const std::vector allowed_inputs, const std::string& io_name, - const uint32_t index) -{ - std::string deliminator = "__"; - - if (is_dict_input_) { - // If dictionary, index is irrelevant but we use the map to store the - // input names since they are the keys for the dictionary - input_index_map_[io_name] = index; - } else { - switch (naming_convention) { - case NamingConvention::FORWARD_ARGUMENT: { - auto itr = - std::find(allowed_inputs.begin(), allowed_inputs.end(), io_name); - if (itr != allowed_inputs.end()) { - input_index_map_[io_name] = - std::distance(allowed_inputs.begin(), itr); - } - return; - } - case NamingConvention::NAMED_INDEX: { - int start_pos = io_name.find(deliminator); - int ip_index = std::atoi(io_name.substr(start_pos + 2).c_str()); - input_index_map_[io_name] = ip_index; - return; - } - case NamingConvention::STRICT_CONFIG_ORDERING: { - input_index_map_[io_name] = index; - return; - } - } - } -} - -TRITONSERVER_Error* -ModelInstanceState::ValidateInputs(const size_t expected_input_cnt) -{ - // Collect all the expected input tensor names and validate that the model - // configuration specifies only those. - std::vector allowed_inputs; - - const torch::jit::Method& method = torch_model_->get_method("forward"); - const auto& schema = method.function().getSchema(); - const std::vector& arguments = schema.arguments(); - - // Currently, only models with a single input of type Dict(str, Tensor) are - // supported. If the model expects more than one input then they must be all - // be of type Tensor. 
- // - // Ignore the argument at idx 0 if it is of Class type (self param in forward - // function) - size_t start_idx = 0; - if ((arguments.size() > 0) && - (arguments.at(0).type()->kind() == c10::TypeKind::ClassType)) { - start_idx = 1; - } - if ((arguments.size() == (1 + start_idx)) && - (arguments.at(start_idx).type()->kind() == c10::TypeKind::DictType)) { - is_dict_input_ = true; - } else if (arguments.size() > start_idx) { - // Return error if multiple inputs are of kind DictType - for (size_t i = start_idx + 1; i < arguments.size(); i++) { - if (arguments.at(i).type()->kind() == c10::TypeKind::DictType) { - return TRITONSERVER_ErrorNew( - TRITONSERVER_ERROR_INTERNAL, - "Multiple inputs of kind DictType were detected. Only a single " - "input of type Dict(str, Tensor) is supported."); - } - } - - // Return error if all inputs are not of type Tensor - for (size_t i = start_idx; i < arguments.size(); i++) { - if ((arguments.at(i).type()->kind() != c10::TypeKind::TensorType) && - (arguments.at(i).type()->kind() != c10::TypeKind::ListType)) { - return TRITONSERVER_ErrorNew( - TRITONSERVER_ERROR_INTERNAL, - (std::string("An input of type '") + arguments.at(i).type()->str() + - "' was detected in the model. Only a single input of type " - "Dict(str, Tensor) or input(s) of type Tensor are supported.") - .c_str()); - } - allowed_inputs.emplace_back(arguments.at(i).name()); - } - - // If all inputs are tensors, match number of expected inputs between model - // and configuration - if ((arguments.size() - start_idx) != expected_input_cnt) { - return TRITONSERVER_ErrorNew( - TRITONSERVER_ERROR_INVALID_ARG, - (std::string("unable to load model '") + model_state_->Name() + - "', configuration expects " + std::to_string(expected_input_cnt) + - " inputs, model provides " + - std::to_string(arguments.size() - start_idx)) - .c_str()); - } - } - - triton::common::TritonJson::Value ios; - RETURN_IF_ERROR(model_state_->ModelConfig().MemberAsArray("input", &ios)); - - if (ios.ArraySize() == 0) { - return TRITONSERVER_ErrorNew( - TRITONSERVER_ERROR_INTERNAL, - "model configuration must contain at least one input, none were " - "specified."); - } - - NamingConvention naming_convention; - RETURN_IF_ERROR(GetNamingConvention(&naming_convention, allowed_inputs)); - - for (size_t i = 0; i < ios.ArraySize(); i++) { - triton::common::TritonJson::Value io; - RETURN_IF_ERROR(ios.IndexAsObject(i, &io)); - - // Validate name - std::string io_name; - RETURN_IF_ERROR(io.MemberAsString("name", &io_name)); - AddInputToMap(naming_convention, allowed_inputs, io_name, i); - // Validate data type - std::string io_dtype; - RETURN_IF_ERROR(io.MemberAsString("data_type", &io_dtype)); - const auto pr = ModelConfigDataTypeToTorchType(io_dtype); - if (!pr.first && (io_dtype != "TYPE_STRING")) { - return TRITONSERVER_ErrorNew( - TRITONSERVER_ERROR_INTERNAL, - ("unsupported datatype " + io_dtype + " for input '" + io_name + - "' for model '" + model_state_->Name() + "'") - .c_str()); - } - - // Validate shape for String inputs. Only allow 1 dimension. - if (io_dtype == "TYPE_STRING") { - // If a reshape is provided for the input then use that when - // validating the model shapes. - std::vector dims; - triton::common::TritonJson::Value reshape; - if (io.Find("reshape", &reshape)) { - RETURN_IF_ERROR(ParseShape(reshape, "shape", &dims)); - } else { - RETURN_IF_ERROR(ParseShape(io, "dims", &dims)); - } - - if ((dims.size() + (supports_batching_ ? 
1 : 0)) > 1) { - return TRITONSERVER_ErrorNew( - TRITONSERVER_ERROR_INTERNAL, - ("Triton only supports 1 dimensional List of String as input for " - "'" + - std::string(io_name) + "' for model '" + model_state_->Name() + - "'") - .c_str()); - } - } - } - triton::common::TritonJson::Value sequence_batching; - if (model_state_->ModelConfig().Find( - "sequence_batching", &sequence_batching)) { - triton::common::TritonJson::Value states; - if (sequence_batching.Find("state", &states)) { - for (size_t i = 0; i < states.ArraySize(); i++) { - triton::common::TritonJson::Value state; - RETURN_IF_ERROR(states.IndexAsObject(i, &state)); - std::string state_name; - RETURN_IF_ERROR(state.MemberAsString("input_name", &state_name)); - AddInputToMap(naming_convention, allowed_inputs, state_name, i); - - // Validate data type - std::string state_dtype; - RETURN_IF_ERROR(state.MemberAsString("data_type", &state_dtype)); - const auto pr = ModelConfigDataTypeToTorchType(state_dtype); - if (!pr.first && (state_dtype != "TYPE_STRING")) { - return TRITONSERVER_ErrorNew( - TRITONSERVER_ERROR_INTERNAL, - ("unsupported datatype " + state_dtype + " for input state '" + - state_name + "' for model '" + model_state_->Name() + "'") - .c_str()); - } - - // Validate shape for String inputs. Only allow 1 dimension. - if (state_dtype == "TYPE_STRING") { - std::vector dims; - if ((dims.size() + (supports_batching_ ? 1 : 0)) > 1) { - return TRITONSERVER_ErrorNew( - TRITONSERVER_ERROR_INTERNAL, - ("Triton only supports 1 dimensional List of String as input " - "for " - "'" + - std::string(state_name) + "' for model '" + - model_state_->Name() + "'") - .c_str()); - } - } - } - } - } - - triton::common::TritonJson::Value batch_inputs; - RETURN_IF_ERROR( - model_state_->ModelConfig().MemberAsArray("batch_input", &batch_inputs)); - size_t i = 0; - for (const auto& batch_input : StateForModel()->BatchInputs()) { - for (const auto& input_name : batch_input.TargetNames()) { - AddInputToMap( - naming_convention, allowed_inputs, input_name, i + ios.ArraySize()); - i++; - } - } - - return nullptr; // success -} - -TRITONSERVER_Error* -ModelInstanceState::ValidateOutputs() -{ - triton::common::TritonJson::Value ios; - RETURN_IF_ERROR(model_state_->ModelConfig().MemberAsArray("output", &ios)); - std::string deliminator = "__"; - int op_index = 0; - - if (ios.ArraySize() == 0) { - return TRITONSERVER_ErrorNew( - TRITONSERVER_ERROR_INTERNAL, - "model configuration must contain at least one output, none were " - "specified."); - } - - NamingConvention naming_convention; - RETURN_IF_ERROR(GetNamingConvention(&naming_convention, {})); - - for (size_t i = 0; i < ios.ArraySize(); i++) { - triton::common::TritonJson::Value io; - RETURN_IF_ERROR(ios.IndexAsObject(i, &io)); - - // Validate name - std::string io_name; - RETURN_IF_ERROR(io.MemberAsString("name", &io_name)); - switch (naming_convention) { - case NamingConvention::NAMED_INDEX: { - int start_pos = io_name.find(deliminator); - op_index = std::atoi(io_name.substr(start_pos + 2).c_str()); - break; - } - case NamingConvention::STRICT_CONFIG_ORDERING: { - op_index = i; - break; - } - default: - break; - } - - // Validate data type - std::string io_dtype; - RETURN_IF_ERROR(io.MemberAsString("data_type", &io_dtype)); - const auto pr = ModelConfigDataTypeToTorchType(io_dtype); - if (!pr.first && (io_dtype != "TYPE_STRING")) { - return TRITONSERVER_ErrorNew( - TRITONSERVER_ERROR_INTERNAL, - ("unsupported datatype " + io_dtype + " for output '" + io_name + - "' for model '" + 
model_state_->Name() + "'") - .c_str()); - } - - // Validate shape for String outputs. Only allow 1 dimension. - if (io_dtype == "TYPE_STRING") { - // If a reshape is provided for the output then use that when - // validating the model shapes. - std::vector dims; - triton::common::TritonJson::Value reshape; - if (io.Find("reshape", &reshape)) { - RETURN_IF_ERROR(ParseShape(reshape, "shape", &dims)); - } else { - RETURN_IF_ERROR(ParseShape(io, "dims", &dims)); - } - - if ((dims.size() + (supports_batching_ ? 1 : 0)) > 1) { - return TRITONSERVER_ErrorNew( - TRITONSERVER_ERROR_INTERNAL, - ("Triton only supports 1 dimensional List of String as output for " - "'" + - std::string(io_name) + "' for model '" + model_state_->Name() + - "'") - .c_str()); - } - } - - output_index_map_[io_name] = op_index; - output_dtype_map_[io_name] = ConvertTorchTypeToDataType(pr.second); - } - - triton::common::TritonJson::Value sequence_batching; - if (model_state_->ModelConfig().Find( - "sequence_batching", &sequence_batching)) { - triton::common::TritonJson::Value states; - if (sequence_batching.Find("state", &states)) { - for (size_t i = 0; i < states.ArraySize(); i++) { - triton::common::TritonJson::Value state; - RETURN_IF_ERROR(states.IndexAsObject(i, &state)); - std::string state_name; - RETURN_IF_ERROR(state.MemberAsString("output_name", &state_name)); - std::string state_dtype; - RETURN_IF_ERROR(state.MemberAsString("data_type", &state_dtype)); - std::vector dims; - RETURN_IF_ERROR(ParseShape(state, "dims", &dims)); - - // For state, naming convention is enforced to be NAMED_INDEX - int start_pos = state_name.find(deliminator); - op_index = std::atoi(state_name.substr(start_pos + 2).c_str()); - - const auto pr = ModelConfigDataTypeToTorchType(state_dtype); - if (!pr.first && (state_dtype != "TYPE_STRING")) { - return TRITONSERVER_ErrorNew( - TRITONSERVER_ERROR_INTERNAL, - ("unsupported datatype " + state_dtype + " for state '" + - state_name + "' for model '" + model_state_->Name() + "'") - .c_str()); - } - - // Validate shape for String outputs. Only allow 1 dimension. - if (state_dtype == "TYPE_STRING") { - if ((dims.size() + (supports_batching_ ? 1 : 0)) > 1) { - return TRITONSERVER_ErrorNew( - TRITONSERVER_ERROR_INTERNAL, - ("Triton only supports 1 dimensional List of String as output " - "for " - "'" + - std::string(state_name) + "' for model '" + - model_state_->Name() + "'") - .c_str()); - } - } - - output_index_map_[state_name] = op_index; - output_dtype_map_[state_name] = ConvertTorchTypeToDataType(pr.second); - } - } - } - - return nullptr; // success -} - -void -ModelInstanceState::ProcessRequests( - TRITONBACKEND_Request** requests, const uint32_t request_count) -{ - LOG_MESSAGE( - TRITONSERVER_LOG_VERBOSE, - (std::string("TRITONBACKEND_ModelExecute: Running ") + Name() + " with " + - std::to_string(request_count) + " requests") - .c_str()); - -#ifdef TRITON_ENABLE_GPU - if (Kind() == TRITONSERVER_INSTANCEGROUPKIND_GPU) { - SetCurrentCudaStream(stream_, DeviceId()); - } else if (Kind() == TRITONSERVER_INSTANCEGROUPKIND_MODEL) { - // Replace the default stream of each device with the one we created. - for (size_t i = 0; i < stream_vec_.size(); i++) { - SetCurrentCudaStream(stream_vec_[i], i); - } - } -#endif - - NVTX_RANGE(nvtx_, "ProcessRequests " + Name()); - - uint64_t exec_start_ns = 0; - SET_TIMESTAMP(exec_start_ns); - - const int max_batch_size = model_state_->MaxBatchSize(); - - // For each request collect the total batch size for this inference - // execution. 
The batch-size, number of inputs, and size of each - // input have already been checked, so we don't need to do that here. - size_t total_batch_size = 0; - for (size_t i = 0; i < request_count; i++) { - // If we get a nullptr request then something is badly wrong. Fail - // and release all requests. - if (requests[i] == nullptr) { - RequestsRespondWithError( - requests, request_count, - TRITONSERVER_ErrorNew( - TRITONSERVER_ERROR_INTERNAL, - std::string( - "null request given to PyTorch backend for '" + Name() + "'") - .c_str())); - return; - } - } - - // At this point we are committed to running inference with all - // 'requests'. Create a response for each request. During input - // processing if there is an error with any request that error will - // be sent immediately with the corresponding response (and the - // response unique_ptr will then be nullptr). The request object - // itself will not be released until after all inferencing is done - // (below) as we may need to access the request object when - // determining how to process outputs (for example, even if we don't - // need the outputs for a request that has an error, we do need to - // know the size of those outputs associated with the request so we - // can skip them in the output tensors). - std::vector<TRITONBACKEND_Response*> responses; - responses.reserve(request_count); - bool all_response_failed = false; - - for (size_t i = 0; i < request_count; i++) { - TRITONBACKEND_Response* response; - auto err = TRITONBACKEND_ResponseNew(&response, requests[i]); - if (err == nullptr) { - responses.emplace_back(response); - } else { - responses.emplace_back(nullptr); - LOG_MESSAGE(TRITONSERVER_LOG_ERROR, "Failed to create response"); - TRITONSERVER_ErrorDelete(err); - } - } - - for (size_t i = 0; i < request_count; i++) { - if (max_batch_size > 0) { - // Retrieve the batch size from one of the inputs. If the model - // supports batching, the first dimension size is the batch size. - TRITONBACKEND_Input* input; - TRITONSERVER_Error* err = - TRITONBACKEND_RequestInputByIndex(requests[i], 0 /* index */, &input); - if (err == nullptr) { - const int64_t* shape; - err = TRITONBACKEND_InputProperties( - input, nullptr, nullptr, &shape, nullptr, nullptr, nullptr); - total_batch_size += shape[0]; - } - if (err != nullptr) { - RESPOND_ALL_AND_SET_TRUE_IF_ERROR( - responses, request_count, all_response_failed, err); - } - } else { - total_batch_size += 1; - } - } - - // If there are no valid payloads then no need to run the inference. - if (total_batch_size == 0) { - return; - } - - // Make sure the maximum batch size is not exceeded. The - // total_batch_size must be 1 for models that don't support batching - // (i.e. max_batch_size == 0). If max_batch_size is exceeded then the - // scheduler has done something badly wrong so fail and release all - // requests. - if (!all_response_failed) { - if ((total_batch_size != 1) && - (total_batch_size > (size_t)max_batch_size)) { - RESPOND_ALL_AND_SET_TRUE_IF_ERROR( - responses, request_count, all_response_failed, - TRITONSERVER_ErrorNew( - TRITONSERVER_ERROR_INTERNAL, - std::string( - "batch size " + std::to_string(total_batch_size) + " for '" + - Name() + "', max allowed is " + - std::to_string(max_batch_size)) - .c_str())); - } - } - - std::vector<const char*> input_names; - std::vector<torch::jit::IValue> input_tensors; - bool cuda_copy = false; - std::unique_ptr<BackendInputCollector> collector; - - // For 'KIND_MODEL', it's fine to use CUDA events to calculate the compute - // input duration since only one stream will be used for input collection.
- if ((Kind() == TRITONSERVER_INSTANCEGROUPKIND_GPU) || - ((Kind() == TRITONSERVER_INSTANCEGROUPKIND_MODEL) && (device_cnt_ > 0))) { -#ifdef TRITON_ENABLE_GPU - RESPOND_ALL_AND_SET_TRUE_IF_ERROR( - responses, request_count, all_response_failed, - ConvertCUDAStatusToTritonError( - cudaEventRecord( - compute_input_start_event_, GetCudaStreamByInstanceKind()), - TRITONSERVER_ERROR_INTERNAL, "Failed to record the event.")); -#endif - } - - if (!all_response_failed) { - collector.reset(new BackendInputCollector( - requests, request_count, &responses, - model_state_->TritonMemoryManager(), model_state_->EnablePinnedInput(), - GetCudaStreamByInstanceKind(), nullptr, nullptr, 0, - HostPolicyName().c_str())); - RESPOND_ALL_AND_SET_TRUE_IF_ERROR( - responses, request_count, all_response_failed, - SetInputTensors( - total_batch_size, requests, request_count, &responses, - collector.get(), &input_names, &input_tensors, &cuda_copy)); - } - -#ifdef TRITON_ENABLE_GPU - if (cuda_copy) { - cudaStreamSynchronize(GetCudaStreamByInstanceKind()); - cuda_copy = false; - } -#endif - - std::vector output_tensors; - uint64_t compute_start_ns = 0; - uint64_t compute_infer_start = 0; - - RESPOND_ALL_AND_SET_TRUE_IF_ERROR( - responses, request_count, all_response_failed, - RecordBackendTimestamp( - &compute_start_ns, - reinterpret_cast(&compute_infer_start_event_))); - - // For 'KIND_MODEL', capture the timestamp for the compute infer duration. - if ((Kind() == TRITONSERVER_INSTANCEGROUPKIND_MODEL) && (device_cnt_ > 0)) { - SET_TIMESTAMP(compute_infer_start); - } - - // Run... - if (!all_response_failed) { - Execute(&responses, request_count, &input_tensors, &output_tensors); - } - - // Verify output indices are valid with number of outputs after execution - bool invalid_index = false; - int max_index = output_tensors.size() - 1; - - if (!all_response_failed) { - for (const auto& name : model_state_->ModelOutputs()) { - int op_index = output_index_map_[name.first]; - if ((op_index < 0) || (op_index > max_index)) { - RESPOND_ALL_AND_SET_TRUE_IF_ERROR( - responses, request_count, all_response_failed, - TRITONSERVER_ErrorNew( - TRITONSERVER_ERROR_INVALID_ARG, - std::string( - "The output " + std::string(name.first) + - " in the model configuration refers to an output index " - "which doesn't exist. This model has " + - std::to_string(max_index + 1) + " outputs") - .c_str())); - invalid_index = true; - break; - } - } - } - -#ifdef TRITON_ENABLE_GPU - if (Kind() == TRITONSERVER_INSTANCEGROUPKIND_MODEL) { - // For 'KIND_MODEL', multiple streams will be involved, so we need to call - // 'cudaStreamSynchronize' before reading the output tensors. 
- for (auto& stream : stream_vec_) { - cudaStreamSynchronize(stream); - } - } -#endif - - uint64_t compute_end_ns = 0; - uint64_t compute_output_start = 0; - - if ((Kind() == TRITONSERVER_INSTANCEGROUPKIND_MODEL) && (device_cnt_ > 0)) { -#ifdef TRITON_ENABLE_GPU - SET_TIMESTAMP(compute_output_start); -#endif - } else { - RESPOND_ALL_AND_SET_TRUE_IF_ERROR( - responses, request_count, all_response_failed, - RecordBackendTimestamp( - &compute_end_ns, - reinterpret_cast<void*>(&compute_output_start_event_))); - } - - if (!all_response_failed) { - if (!invalid_index) { - RESPOND_ALL_AND_SET_TRUE_IF_ERROR( - responses, request_count, all_response_failed, - ReadOutputTensors( - total_batch_size, output_tensors, requests, request_count, - &responses)); - } - } - - uint64_t exec_end_ns = 0; - SET_TIMESTAMP(exec_end_ns); - - // Send all the responses that haven't already been sent because of - // an earlier error. Note that the responses are not set to nullptr - // here as we need that indication below to determine if the request - // was successful or not. - for (auto& response : responses) { - if (response != nullptr) { - LOG_IF_ERROR( - TRITONBACKEND_ResponseSend( - response, TRITONSERVER_RESPONSE_COMPLETE_FINAL, nullptr), - "failed to send PyTorch backend response"); - } - } - - // We don't need an explicit CUDA synchronization here since we have already - // synchronized the stream in the ReadOutputTensors function. - if (Kind() == TRITONSERVER_INSTANCEGROUPKIND_GPU) { -#ifdef TRITON_ENABLE_GPU - float compute_input_duration = GetCudaEventElapsedTime( - compute_input_start_event_, compute_infer_start_event_); - float compute_infer_duration = GetCudaEventElapsedTime( - compute_infer_start_event_, compute_output_start_event_); - - compute_start_ns = exec_start_ns + (compute_input_duration * 1e6); - compute_end_ns = compute_start_ns + (compute_infer_duration * 1e6); -#endif - } else if ( - (Kind() == TRITONSERVER_INSTANCEGROUPKIND_MODEL) && (device_cnt_ > 0)) { -#ifdef TRITON_ENABLE_GPU - float compute_input_duration = GetCudaEventElapsedTime( - compute_input_start_event_, compute_infer_start_event_); - uint64_t compute_infer_duration = - compute_output_start - compute_infer_start; - - compute_start_ns = exec_start_ns + (compute_input_duration * 1e6); - compute_end_ns = compute_start_ns + compute_infer_duration; -#endif - } - - // Report statistics for each request. - for (uint32_t r = 0; r < request_count; ++r) { - auto& request = requests[r]; - LOG_IF_ERROR( - TRITONBACKEND_ModelInstanceReportStatistics( - TritonModelInstance(), request, - (responses[r] != nullptr) /* success */, exec_start_ns, - compute_start_ns, compute_end_ns, exec_end_ns), - "failed reporting request statistics"); - - LOG_IF_ERROR( - TRITONBACKEND_RequestRelease(request, TRITONSERVER_REQUEST_RELEASE_ALL), - "failed releasing request"); - } - - if (!all_response_failed) { - // Report the entire batch statistics.
- LOG_IF_ERROR( - TRITONBACKEND_ModelInstanceReportBatchStatistics( - TritonModelInstance(), total_batch_size, exec_start_ns, - compute_start_ns, compute_end_ns, exec_end_ns), - "failed reporting batch request statistics"); - } -} - -void -ModelInstanceState::Execute( - std::vector* responses, - const uint32_t response_count, - std::vector* input_tensors, - std::vector* output_tensors) -{ - NVTX_RANGE(nvtx_, "Execute " + Name()); - - torch::jit::IValue model_outputs_; - - try { - // enable/disable optimized execution - torch::jit::setGraphExecutorOptimize( - model_state_->EnabledOptimizedExecution()); - - // enable/disable inference mode - supersedes NoGradGuard - torch::InferenceMode infer_guard(model_state_->EnabledInferenceMode()); - - // enable/disable cudnn - at::globalContext().setUserEnabledCuDNN(model_state_->EnabledCudnn()); - - // JIT. No change is made unless parameter is explicitly set. - if (std::get<0>(model_state_->EnabledJitProfiling())) { - torch::jit::getProfilingMode() = - std::get<1>(model_state_->EnabledJitProfiling()); - } - - if (std::get<0>(model_state_->EnabledJitExecutor())) { - torch::jit::getExecutorMode() = - std::get<1>(model_state_->EnabledJitExecutor()); - } - - // Fuser. No change is made unless fuser is explicitly set in - // parameters. - if (std::get<0>(model_state_->EnabledTensorExprFuser())) { - torch::jit::setTensorExprFuserEnabled( - std::get<1>(model_state_->EnabledTensorExprFuser())); - } - - torch::NoGradGuard no_grad; - - // If input is a dictionary, prepare dictionary from 'input_tensors'. - if (is_dict_input_) { - torch::Dict input_dict; - for (auto& input_index : input_index_map_) { - torch::jit::IValue ival = (*input_tensors)[input_index.second]; - input_dict.insert(input_index.first, ival.toTensor()); - } - std::vector input_dict_ivalue = {input_dict}; - model_outputs_ = torch_model_->forward(input_dict_ivalue); - } else { - model_outputs_ = torch_model_->forward(*input_tensors); - } - - if (model_outputs_.isTuple()) { - auto model_outputs_tuple = model_outputs_.toTuple(); - size_t op_index = 0; - for (auto& m_op : model_outputs_tuple->elements()) { - if (m_op.isList()) { - auto list_output = m_op.toList(); - if (list_output.elementType()->kind() != c10::TypeKind::StringType) { - throw std::invalid_argument( - "output at index " + std::to_string(op_index) + - " must be of type Tensor or List[str], received List[" + - list_output.elementType()->str() + "]"); - } - output_tensors->push_back(m_op); - } else { - auto tensor_output = m_op.toTensor(); - output_tensors->push_back(m_op); - } - op_index++; - } - } else if (model_outputs_.isTensor()) { - output_tensors->push_back(model_outputs_); - } else if (model_outputs_.isList()) { - auto list_output = model_outputs_.toList(); - if (list_output.elementType()->kind() != c10::TypeKind::StringType) { - throw std::invalid_argument( - "output must be of type Tensor or List[str], received List[" + - list_output.elementType()->str() + "]"); - } - output_tensors->push_back(model_outputs_); - } else { - throw std::invalid_argument( - "output must be of type Tensor, List[str] or Tuple containing one of " - "these two types. 
It should not be a List / Dictionary of Tensors or " - "a Scalar"); - } - } - catch (std::exception& ex) { - SendErrorForResponses( - responses, response_count, - TRITONSERVER_ErrorNew( - TRITONSERVER_ERROR_INTERNAL, - ("PyTorch execute failure: " + std::string(ex.what())).c_str())); - } -} - -TRITONSERVER_Error* -ModelInstanceState::GetNamingConvention( - NamingConvention* naming_convention, - const std::vector& allowed_ios) -{ - // Rules for (non-Dictionary) input tensor names: - // 1. Must be in 'allowed_inputs' (arguments in the forward function) - // 2. Must follow the naming convention i.e. __ - // 3. If neither of the above conditions are satisfied, enforce strict - // ordering of model inputs. - // - // Rules for output tensor names: - // 1. Must follow the naming convention i.e. __ - // 2. If not, we enforce strict ordering of model outputs. - std::string deliminator = "__"; - std::string io_kind = "input"; - *naming_convention = NamingConvention::FORWARD_ARGUMENT; - - // symbolizes output - if (allowed_ios.size() == 0) { - io_kind = "output"; - *naming_convention = NamingConvention::NAMED_INDEX; - } - - triton::common::TritonJson::Value ios; - RETURN_IF_ERROR( - model_state_->ModelConfig().MemberAsArray(io_kind.c_str(), &ios)); - - if (io_kind == "input") { - for (size_t i = 0; i < ios.ArraySize(); i++) { - triton::common::TritonJson::Value io; - RETURN_IF_ERROR(ios.IndexAsObject(i, &io)); - - // Validate name - std::string io_name; - RETURN_IF_ERROR(io.MemberAsString("name", &io_name)); - auto itr = std::find(allowed_ios.begin(), allowed_ios.end(), io_name); - if (itr == allowed_ios.end()) { - *naming_convention = NamingConvention::NAMED_INDEX; - break; - } - } - } - - // If not, check if inputs follow INDEX - if (*naming_convention == NamingConvention::NAMED_INDEX) { - for (size_t i = 0; i < ios.ArraySize(); i++) { - triton::common::TritonJson::Value io; - RETURN_IF_ERROR(ios.IndexAsObject(i, &io)); - - // Validate name - std::string io_name; - RETURN_IF_ERROR(io.MemberAsString("name", &io_name)); - int start_pos = io_name.find(deliminator); - if (start_pos == -1) { - *naming_convention = NamingConvention::STRICT_CONFIG_ORDERING; - break; - } else { - // check if the index part of the name is not an integer - std::string index_str = io_name.substr(start_pos + 2); - bool is_int = true; - for (auto itr = index_str.begin(); itr != index_str.end(); itr++) { - if (std::isdigit(*itr) == 0) { - is_int = false; - } - } - - if (!is_int) { - if (io_kind == "input") { - LOG_MESSAGE( - TRITONSERVER_LOG_WARN, - ("input '" + io_name + - "' or previous input(s) are neither an input argument to the " - "model '" + - model_state_->Name() + - "' nor do they follow the __ naming convention. " - "Falling back to enforcing strict ordering from model " - "configuration.") - .c_str()); - } else { - LOG_MESSAGE( - TRITONSERVER_LOG_WARN, - ("output '" + io_name + - "' or previous output(s) of the model '" + - model_state_->Name() + - "' do not follow the __ naming convention. 
" - "Falling back to enforcing strict ordering from model " - "configuration.") - .c_str()); - } - *naming_convention = NamingConvention::STRICT_CONFIG_ORDERING; - break; - } - } - } - } - - triton::common::TritonJson::Value sequence_batching; - if (model_state_->ModelConfig().Find( - "sequence_batching", &sequence_batching)) { - // If we need to manage state for the model, then we need to check - // the naming of the state adheres to both the input and output conventions - triton::common::TritonJson::Value states; - if (sequence_batching.Find("state", &states)) { - if (*naming_convention != NamingConvention::NAMED_INDEX) { - return TRITONSERVER_ErrorNew( - TRITONSERVER_ERROR_INVALID_ARG, - ("PyTorch model '" + model_state_->Name() + - "' is using sequence batching with state but not all inputs and " - "outputs follow the __ naming convention. ") - .c_str()); - } - } - - for (size_t i = 0; i < states.ArraySize(); i++) { - triton::common::TritonJson::Value state; - RETURN_IF_ERROR(states.IndexAsObject(i, &state)); - std::string name_entry = - io_kind == "input" ? "input_name" : "output_name"; - std::string state_name; - RETURN_IF_ERROR(state.MemberAsString(name_entry.c_str(), &state_name)); - int start_pos = state_name.find(deliminator); - if (start_pos == -1) { - return TRITONSERVER_ErrorNew( - TRITONSERVER_ERROR_INVALID_ARG, - ("PyTorch model '" + model_state_->Name() + - "' is using sequence batching with state but state '" + - state_name + - "' does not follow the __ naming convention. ") - .c_str()); - } else { - // check if the index part of the name is not an integer - std::string index_str = state_name.substr(start_pos + 2); - bool is_int = true; - for (auto itr = index_str.begin(); itr != index_str.end(); itr++) { - if (std::isdigit(*itr) == 0) { - is_int = false; - } - } - if (!is_int) { - return TRITONSERVER_ErrorNew( - TRITONSERVER_ERROR_INVALID_ARG, - ("PyTorch model '" + model_state_->Name() + - "' is using sequence batching with state but state '" + - state_name + - "' does not follow the __ naming convention. ") - .c_str()); - } - } - } - } - - return nullptr; // success -} - -// This function will return a tensor's contents as a contiguous -// chunk in system memory. In some cases this will require copying the data. -// If that happens, 'contiguous_buffer' will be set to hold the contiguous -// chunk and 'cuda_copy' will be set to indicate whether CUDA copy is -// conducted. The data copy can be avoided if the input is already in -// a contiguous chunk and the input is located in memory type and id -// specified. 
-TRITONSERVER_Error* -GetContiguousInputContent( - TRITONBACKEND_Input* rinput, const uint32_t buffer_count, - const char** content, size_t* content_byte_size, - std::vector* contiguous_buffer, cudaStream_t stream, bool* cuda_copy) -{ - *cuda_copy = false; - - // Check input buffers to see if data copy is necessary - size_t chunk_count = 0; - bool type_mismatch = false; - uint64_t total_byte_size = 0; - for (size_t idx = 0; idx < buffer_count; ++idx) { - TRITONSERVER_MemoryType src_memory_type; - int64_t src_memory_type_id; - size_t src_byte_size; - const void* src_ptr; - - RETURN_IF_ERROR(TRITONBACKEND_InputBuffer( - rinput, idx, &src_ptr, &src_byte_size, &src_memory_type, - &src_memory_type_id)); - - if (src_ptr != nullptr) { - chunk_count++; - total_byte_size += src_byte_size; - type_mismatch |= (src_memory_type == TRITONSERVER_MEMORY_GPU); - } - } - - if (chunk_count == 0) { - *content = nullptr; - *content_byte_size = 0; - } else if ((chunk_count == 1) && !type_mismatch) { - TRITONSERVER_MemoryType src_memory_type; - int64_t src_memory_type_id; - RETURN_IF_ERROR(TRITONBACKEND_InputBuffer( - rinput, 0, (const void**)content, content_byte_size, &src_memory_type, - &src_memory_type_id)); - } else { - contiguous_buffer->resize(total_byte_size); - - size_t offset = 0; - for (size_t i = 0; i < chunk_count; i++) { - bool cuda_used; - TRITONSERVER_MemoryType src_memory_type; - int64_t src_memory_type_id; - size_t src_byte_size; - const void* src_ptr; - - RETURN_IF_ERROR(TRITONBACKEND_InputBuffer( - rinput, i, &src_ptr, &src_byte_size, &src_memory_type, - &src_memory_type_id)); - RETURN_IF_ERROR(CopyBuffer( - "Contiguous input", src_memory_type, src_memory_type_id, - TRITONSERVER_MEMORY_CPU, 0, src_byte_size, src_ptr, - contiguous_buffer->data() + offset, stream, &cuda_used)); - *cuda_copy |= cuda_used; - offset += src_byte_size; - } - - *content = contiguous_buffer->data(); - *content_byte_size = total_byte_size; - } - - return nullptr; // success -} - -void -FillStringTensor(torch::List* input_list, const size_t cnt) -{ - for (size_t c = 0; c < cnt; ++c) { - input_list->push_back(""); - } -} - -bool -SetStringInputTensor( - torch::List* input_list, TRITONBACKEND_Input* input, - const char* name, const uint32_t buffer_count, - const size_t request_element_cnt, TRITONBACKEND_Response** response, - cudaStream_t stream, const char* host_policy_name) -{ - bool cuda_copy = false; - - // For string data type, we always need to have the data on CPU so - // that we can read string length and construct the string - // properly. So if the request's input tensor is not in CPU need to - // copy it there. - const char* content = nullptr; - size_t content_byte_size = 0; - - std::vector contiguous_buffer; - auto err = GetContiguousInputContent( - input, buffer_count, &content, &content_byte_size, &contiguous_buffer, - stream, &cuda_copy); - if (err != nullptr) { - RESPOND_AND_SET_NULL_IF_ERROR(response, err); - FillStringTensor(input_list, request_element_cnt); - return cuda_copy; - } - -#ifdef TRITON_ENABLE_GPU - if (cuda_copy) { - cudaStreamSynchronize(stream); - cuda_copy = false; - } -#endif // TRITON_ENABLE_GPU - - std::vector> str_list; - err = ValidateStringBuffer( - content, content_byte_size, request_element_cnt, name, &str_list); - // Set string values. 
- for (const auto& [addr, len] : str_list) { - input_list->push_back(std::string(addr, len)); - } - - size_t element_cnt = str_list.size(); - if (err != nullptr) { - RESPOND_AND_SET_NULL_IF_ERROR(response, err); - FillStringTensor(input_list, request_element_cnt - element_cnt); - } - return cuda_copy; -} - -bool -SetStringBuffer( - torch::List* tensor, TRITONBACKEND_Response** response, - TRITONBACKEND_Output* response_output, TRITONBACKEND_State* response_state, - const size_t tensor_element_count, cudaStream_t stream, - std::string* serialized, bool state) -{ - bool cuda_copy = false; - - // Serialize the output tensor strings. Each string is serialized as - // a 4-byte length followed by the string itself with no - // null-terminator. - serialized->clear(); - for (size_t e = 0; e < tensor_element_count; ++e) { - std::string str = tensor->get(e).to(); - const char* cstr = str.c_str(); - size_t len = str.length(); - serialized->append(reinterpret_cast(&len), sizeof(uint32_t)); - if (len > 0) { - serialized->append(cstr, len); - } - } - - // Allocate a buffer large enough to hold the serialized tensor. - TRITONSERVER_MemoryType actual_memory_type = TRITONSERVER_MEMORY_CPU; - int64_t actual_memory_type_id = 0; - - TRITONSERVER_Error* err; - void* buffer; - - if (!state) { - auto err = TRITONBACKEND_OutputBuffer( - response_output, &buffer, serialized->size(), &actual_memory_type, - &actual_memory_type_id); - if (err != nullptr) { - RESPOND_AND_SET_NULL_IF_ERROR(response, err); - return cuda_copy; - } - } else { - auto err = TRITONBACKEND_StateBuffer( - response_state, &buffer, serialized->size(), &actual_memory_type, - &actual_memory_type_id); - if (err != nullptr) { - RESPOND_AND_SET_NULL_IF_ERROR(response, err); - return cuda_copy; - } - } - // Copy the serialized tensor into the allocated buffer. 
- bool cuda_used = false; - err = CopyBuffer( - "String output", TRITONSERVER_MEMORY_CPU /* src_memory_type */, - 0 /* src_memory_type_id */, actual_memory_type, actual_memory_type_id, - serialized->size(), reinterpret_cast(serialized->c_str()), - buffer, stream, &cuda_used); - cuda_copy |= cuda_used; - - if (err != nullptr) { - RESPOND_AND_SET_NULL_IF_ERROR(response, err); - return cuda_copy; - } - - if (state) { - RESPOND_AND_SET_NULL_IF_ERROR( - response, TRITONBACKEND_StateUpdate(response_state)); - } - - return cuda_copy; -} - - -bool -SetStringOutputBuffer( - torch::List* tensor, TRITONBACKEND_Response** response, - TRITONBACKEND_Output* response_output, const size_t tensor_element_count, - cudaStream_t stream, std::string* serialized) -{ - return SetStringBuffer( - tensor, response, response_output, nullptr /* response_state */, - tensor_element_count, stream, serialized, false /* state */); -} - -bool -SetStringStateBuffer( - torch::List* tensor, TRITONBACKEND_Response** response, - TRITONBACKEND_State* response_state, const size_t tensor_element_count, - cudaStream_t stream, std::string* serialized) -{ - return SetStringBuffer( - tensor, response, nullptr /* response_output */, response_state, - tensor_element_count, stream, serialized, true /* state */); -} - - -TRITONSERVER_Error* -ModelInstanceState::SetInputTensors( - size_t total_batch_size, TRITONBACKEND_Request** requests, - const uint32_t request_count, - std::vector* responses, - BackendInputCollector* collector, std::vector* input_names, - std::vector* input_tensors, bool* cuda_copy) -{ - // InferenceMode should be used to guard all tensors operations - torch::InferenceMode infer_guard(model_state_->EnabledInferenceMode()); - - // All requests must have equally-sized input tensors so use any - // request as the representative for the input tensors. - uint32_t input_count; - RETURN_IF_ERROR(TRITONBACKEND_RequestInputCount(requests[0], &input_count)); - - input_tensors->resize(input_count + batch_input_count_); - - // The inputs must be in contiguous CPU/GPU memory. - std::vector> alloc_perference; - if (device_.is_cpu()) { - alloc_perference = { - {TRITONSERVER_MEMORY_CPU_PINNED, 0}, {TRITONSERVER_MEMORY_CPU, 0}}; - } else { - alloc_perference = {{TRITONSERVER_MEMORY_GPU, device_.index()}}; - } - - for (uint32_t input_idx = 0; input_idx < input_count; input_idx++) { - TRITONBACKEND_Input* input; - RETURN_IF_ERROR( - TRITONBACKEND_RequestInputByIndex(requests[0], input_idx, &input)); - - const char* input_name; - TRITONSERVER_DataType input_datatype; - const int64_t* input_shape; - uint32_t input_dims_count; - RETURN_IF_ERROR(TRITONBACKEND_InputProperties( - input, &input_name, &input_datatype, &input_shape, &input_dims_count, - nullptr, nullptr)); - - input_names->emplace_back(input_name); - - // The shape for the entire input patch, - // [total_batch_size, ...] 
for non-ragged input and - // [total_element_count] for ragged input (non-nested tensor) - std::vector batchn_shape; - if (StateForModel()->IsInputRagged(input_name)) { - batchn_shape = std::vector{0}; - for (size_t idx = 0; idx < request_count; idx++) { - TRITONBACKEND_Input* input; - RESPOND_AND_SET_NULL_IF_ERROR( - &((*responses)[idx]), - TRITONBACKEND_RequestInput(requests[idx], input_name, &input)); - const int64_t* input_shape; - uint32_t input_dims_count; - RESPOND_AND_SET_NULL_IF_ERROR( - &((*responses)[idx]), TRITONBACKEND_InputProperties( - input, nullptr, nullptr, &input_shape, - &input_dims_count, nullptr, nullptr)); - - int64_t element_cnt = 0; - RESPOND_AND_SET_NULL_IF_ERROR( - &((*responses)[idx]), - GetElementCount(input_shape, input_dims_count, &element_cnt)); - batchn_shape[0] += element_cnt; - } - } else { - batchn_shape = - std::vector(input_shape, input_shape + input_dims_count); - if (supports_batching_) { - batchn_shape[0] = total_batch_size; - } - } - - // The input must be in contiguous CPU/GPU memory. - std::vector> alloc_perference; - // For 'KIND_MODEL', input will always be in CPU as we don't have a way to - // query the input types. - if (device_.is_cpu() || (Kind() == TRITONSERVER_INSTANCEGROUPKIND_MODEL)) { - alloc_perference = { - {TRITONSERVER_MEMORY_CPU_PINNED, 0}, {TRITONSERVER_MEMORY_CPU, 0}}; - } else { - alloc_perference = {{TRITONSERVER_MEMORY_GPU, device_.index()}}; - } - - const char* input_buffer; - size_t batchn_byte_size; - TRITONSERVER_MemoryType memory_type; - int64_t memory_type_id; - RETURN_IF_ERROR(collector->ProcessTensor( - input_name, nullptr, 0, alloc_perference, &input_buffer, - &batchn_byte_size, &memory_type, &memory_type_id)); - - // Create Torch tensor - const auto torch_dtype = ConvertDataTypeToTorchType(input_datatype); - torch::TensorOptions options{torch_dtype.second}; - auto updated_options = (memory_type == TRITONSERVER_MEMORY_GPU) - ? options.device(torch::kCUDA, device_.index()) - : options.device(torch::kCPU); - - if (input_datatype == TRITONSERVER_TYPE_BYTES) { - // Create the PyTorch list to hold the strings. 
- torch::List input_list; - input_list.reserve(batchn_shape[0]); - - for (size_t idx = 0; idx < request_count; idx++) { - TRITONBACKEND_Input* input; - RESPOND_AND_SET_NULL_IF_ERROR( - &((*responses)[idx]), - TRITONBACKEND_RequestInput(requests[idx], input_name, &input)); - const int64_t* shape; - uint32_t dims_count; - uint32_t buffer_count; - RESPOND_AND_SET_NULL_IF_ERROR( - &((*responses)[idx]), - TRITONBACKEND_InputPropertiesForHostPolicy( - input, HostPolicyName().c_str(), nullptr, nullptr, &shape, - &dims_count, nullptr, &buffer_count)); - - int64_t batch_element_cnt = 0; - RESPOND_AND_SET_NULL_IF_ERROR( - &((*responses)[idx]), - GetElementCount(shape, dims_count, &batch_element_cnt)); - - *cuda_copy |= SetStringInputTensor( - &input_list, input, input_name, buffer_count, batch_element_cnt, - &((*responses)[idx]), GetCudaStreamByInstanceKind(), - HostPolicyName().c_str()); - } - - (*input_tensors)[input_index_map_[input_name]] = input_list; - } else { - if (batchn_byte_size) { - // Remove constness to align with the signature of torch::from_blob() - torch::Tensor input_tensor = torch::from_blob( - const_cast(input_buffer), batchn_shape, updated_options); - (*input_tensors)[input_index_map_[input_name]] = input_tensor; - } else { - // torch:from_blob seems not working when the input size is 0 - // create zero-length inputs directly - torch::Tensor input_tensor = - torch::zeros(batchn_shape, updated_options); - (*input_tensors)[input_index_map_[input_name]] = input_tensor; - } - } - } - - for (const auto& batch_input : StateForModel()->BatchInputs()) { - std::vector shape; - collector->BatchInputShape(batch_input, &shape); - - for (const auto& input_name : batch_input.TargetNames()) { - input_names->emplace_back(input_name.c_str()); - - const char* dst_buffer; - size_t dst_buffer_byte_size; - TRITONSERVER_MemoryType dst_memory_type; - int64_t dst_memory_type_id; - - RESPOND_ALL_AND_SET_NULL_IF_ERROR( - (*responses), responses->size(), - collector->ProcessBatchInput( - batch_input, nullptr, 0, alloc_perference, &dst_buffer, - &dst_buffer_byte_size, &dst_memory_type, &dst_memory_type_id)); - - const auto torch_dtype = - ConvertDataTypeToTorchType(batch_input.DataType()); - torch::TensorOptions options{torch_dtype.second}; - auto updated_options = (dst_memory_type == TRITONSERVER_MEMORY_GPU) - ? options.device(torch::kCUDA, device_.index()) - : options.device(torch::kCPU); - - if (dst_buffer_byte_size) { - torch::Tensor input_tensor = torch::from_blob( - const_cast(dst_buffer), shape, updated_options); - (*input_tensors)[input_index_map_[input_name]] = input_tensor; - } else { - // special handle when input has zero size - torch::Tensor input_tensor = torch::zeros(shape, updated_options); - (*input_tensors)[input_index_map_[input_name]] = input_tensor; - } - } - } - - // Finalize... 
- *cuda_copy |= collector->Finalize(); - - return nullptr; -} - -TRITONSERVER_Error* -ModelInstanceState::ReadOutputTensors( - size_t total_batch_size, - const std::vector& output_tensors, - TRITONBACKEND_Request** requests, const uint32_t request_count, - std::vector* responses) -{ - NVTX_RANGE(nvtx_, "ReadOutputTensors " + Name()); - - BackendOutputResponder responder( - requests, request_count, responses, model_state_->TritonMemoryManager(), - model_state_->MaxBatchSize() > 0, model_state_->EnablePinnedInput(), - GetCudaStreamByInstanceKind()); - - bool cuda_copy = false; - // The serialized string buffer must be valid until output copies are done - std::vector> string_buffer; - for (auto& output : model_state_->ModelOutputs()) { - int op_index = output_index_map_[output.first]; - auto name = output.first; - auto output_tensor_pair = output.second; - - if (output_tensors[op_index].isTensor()) { - torch::Tensor output_flat; - try { - output_flat = - output_tensors[op_index].toTensor().contiguous().flatten(); - } - catch (std::exception& ex) { - RETURN_IF_ERROR(TRITONSERVER_ErrorNew( - TRITONSERVER_ERROR_INTERNAL, - (std::string("output tensor '") + name + "' is not found") - .c_str())); - } - - // Verify output datatype matches datatype from model config - TRITONSERVER_DataType output_dtype = - ConvertTorchTypeToDataType(output_flat.scalar_type()); - TRITONSERVER_DataType config_datatype = output_dtype_map_[name]; - if (config_datatype != output_dtype) { - RETURN_IF_ERROR(TRITONSERVER_ErrorNew( - TRITONSERVER_ERROR_INVALID_ARG, - (std::string("configuration expects datatype TYPE_") + - TRITONSERVER_DataTypeString(config_datatype) + " for output '" + - name + "', model provides TYPE_" + - TRITONSERVER_DataTypeString(output_dtype)) - .c_str())); - } - - const char* output_buffer = - static_cast(output_flat.data_ptr()); - - // Output tensors may not reside on the same device as model - torch::Device tensor_device = output_flat.device(); - const auto memory_type = (tensor_device.type() == torch::kCPU) - ? TRITONSERVER_MEMORY_CPU - : TRITONSERVER_MEMORY_GPU; - const auto memory_id = - (tensor_device.type() == torch::kCPU) ? 0 : tensor_device.index(); - - // Batch output doesn't support string data type yet, as it is not trivial - // to parse string output - const BatchOutput* batch_output = StateForModel()->FindBatchOutput(name); - if (batch_output == nullptr) { - // Get output shape - std::vector batchn_shape; - auto shape = output_tensors[op_index].toTensor().sizes(); - for (auto itr = shape.begin(); itr != shape.end(); itr++) { - batchn_shape.push_back(*itr); - } - - if (batchn_shape.size() == 0) { - return TRITONSERVER_ErrorNew( - TRITONSERVER_ERROR_INVALID_ARG, - (std::string("output '") + name + - "' is a scalar which is not supported.") - .c_str()); - } - if (output_tensor_pair.first != -1) { - responder.ProcessTensor( - name, output_dtype, batchn_shape, output_buffer, memory_type, - memory_id); - } - if (output_tensor_pair.second != -1) { - std::vector states; - states = responder.ProcessStateTensor( - name, output_dtype, batchn_shape, output_buffer, memory_type, - memory_id); - // Update the states - for (auto& state : states) { - RETURN_IF_ERROR(TRITONBACKEND_StateUpdate(state)); - } - } - - } else { - responder.ProcessBatchOutput( - name, *batch_output, output_buffer, memory_type, memory_id); - } - } else if (output_tensors[op_index].isList()) { - // Custom handling for string/bytes tensor... 
- torch::List output_list = - output_tensors[op_index].toList(); - - // Get output shape - std::vector batchn_shape{(int64_t)output_list.size()}; - - for (size_t idx = 0; idx < responses->size(); idx++) { - auto& request = requests[idx]; - auto& response = (*responses)[idx]; - - if (supports_batching_ != 0) { - TRITONBACKEND_Input* input; - TRITONBACKEND_RequestInputByIndex(request, 0 /* index*/, &input); - const int64_t* shape; - TRITONBACKEND_InputProperties( - input, nullptr, nullptr, &shape, nullptr, nullptr, nullptr); - batchn_shape[0] = shape[0]; - } - - int64_t tensor_element_cnt = 0; - RETURN_IF_ERROR(GetElementCount(batchn_shape, &tensor_element_cnt)); - - // Only need an response tensor for requested outputs. - if (response != nullptr) { - if (output_tensor_pair.first != -1) { - TRITONBACKEND_Output* response_output; - RESPOND_AND_SET_NULL_IF_ERROR( - &response, TRITONBACKEND_ResponseOutput( - response, &response_output, name.c_str(), - TRITONSERVER_TYPE_BYTES, batchn_shape.data(), - batchn_shape.size())); - string_buffer.emplace_back(new std::string()); - cuda_copy |= SetStringOutputBuffer( - &output_list, &response, response_output, tensor_element_cnt, - GetCudaStreamByInstanceKind(), string_buffer.back().get()); - } - } - if (output_tensor_pair.second != -1) { - TRITONBACKEND_State* response_state; - RESPOND_AND_SET_NULL_IF_ERROR( - &response, TRITONBACKEND_StateNew( - &response_state, request, name.c_str(), - TRITONSERVER_TYPE_BYTES, batchn_shape.data(), - batchn_shape.size())); - - string_buffer.emplace_back(new std::string()); - cuda_copy |= SetStringStateBuffer( - &output_list, &response, response_state, tensor_element_cnt, - GetCudaStreamByInstanceKind(), string_buffer.back().get()); - } - } - } else { - return TRITONSERVER_ErrorNew( - TRITONSERVER_ERROR_INVALID_ARG, - (std::string("output '") + name + - "' must be of type Tensor or List[str].") - .c_str()); - } - } - - // Finalize and wait for any pending buffer copies. - cuda_copy |= responder.Finalize(); - -#ifdef TRITON_ENABLE_GPU - // We have to always synchronize the stream. This is to make sure that - // the events on the cuda stream are synchronized. Otherwise, the events - // are only guaranteed to be synchronized if the model provides the output - // on GPU. - cudaStreamSynchronize(GetCudaStreamByInstanceKind()); -#endif - - return nullptr; -} - -TRITONSERVER_Error* -ModelInstanceState::RecordBackendTimestamp( - uint64_t* timestamp, void* cuda_event) -{ - if ((Kind() == TRITONSERVER_INSTANCEGROUPKIND_GPU) || - ((Kind() == TRITONSERVER_INSTANCEGROUPKIND_MODEL) && (device_cnt_ > 0))) { -#ifdef TRITON_ENABLE_GPU - cudaEvent_t* lcuda_event = reinterpret_cast(cuda_event); - RETURN_IF_ERROR(ConvertCUDAStatusToTritonError( - cudaEventRecord(*lcuda_event, GetCudaStreamByInstanceKind()), - TRITONSERVER_ERROR_INTERNAL, "Failed to record the event.")); -#endif - } else { - SET_TIMESTAMP(*timestamp); - } - return nullptr; -} - -void -ModelInstanceState::CreateCudaEvents(const int32_t& device_id) -{ -#ifdef TRITON_ENABLE_GPU - // Need to set the CUDA context so that the context that events are - // created on match with contexts that events are recorded with. 
- THROW_IF_BACKEND_INSTANCE_ERROR(ConvertCUDAStatusToTritonError( - cudaSetDevice(device_id), TRITONSERVER_ERROR_INTERNAL, - "Failed to set the device")); - THROW_IF_BACKEND_INSTANCE_ERROR(ConvertCUDAStatusToTritonError( - cudaEventCreate(&compute_input_start_event_), TRITONSERVER_ERROR_INTERNAL, - "Failed to create cuda event")); - THROW_IF_BACKEND_INSTANCE_ERROR(ConvertCUDAStatusToTritonError( - cudaEventCreate(&compute_infer_start_event_), TRITONSERVER_ERROR_INTERNAL, - "Failed to create cuda event")); - THROW_IF_BACKEND_INSTANCE_ERROR(ConvertCUDAStatusToTritonError( - cudaEventCreate(&compute_output_start_event_), - TRITONSERVER_ERROR_INTERNAL, "Failed to create cuda event")); -#endif -} - -cudaStream_t -ModelInstanceState::GetCudaStreamByInstanceKind() -{ -#ifdef TRITON_ENABLE_GPU - if (Kind() == TRITONSERVER_INSTANCEGROUPKIND_GPU) { - return stream_; - } else if ( - (Kind() == TRITONSERVER_INSTANCEGROUPKIND_MODEL) && - !stream_vec_.empty()) { - return stream_vec_[0]; - } -#endif - return nullptr; -} - -void -ModelInstanceState::SetCurrentCudaStream( - const cudaStream_t& stream, const int& device_id) -{ -#ifdef TRITON_ENABLE_GPU - at::cuda::CUDAStream torch_stream = - at::cuda::getStreamFromExternal(stream, device_id); - // This function replaces the default stream with the stream we created. It - // is not necessary to change the current device to the desired device when - // replacing the default stream for that device. See the documentation here: - // https://pytorch.org/cppdocs/api/function_namespacec10_1_1cuda_1a6ed50cc0fc16cc7014d9c2f4c3bd098d.html - at::cuda::setCurrentCUDAStream(torch_stream); -#endif -} - -float -ModelInstanceState::GetCudaEventElapsedTime( - const cudaEvent_t& start_event, const cudaEvent_t& end_event) -{ - float duration = 0; -#ifdef TRITON_ENABLE_GPU - // [FIXME] in the case of cudaEventElapsedTime failure, should handle - // stats reporting more gracefully as the durations are inaccurate - LOG_IF_ERROR( - ConvertCUDAStatusToTritonError( - cudaEventElapsedTime(&duration, start_event, end_event), - TRITONSERVER_ERROR_INTERNAL, "Failed to capture elapsed time"), - "Failed to capture elapsed time"); -#endif - return duration; -} - -///////////// +namespace triton::backend::pytorch { extern "C" { @@ -2704,8 +219,8 @@ TRITONBACKEND_ModelInstanceExecute( } return nullptr; // success -} +}; } // extern "C" -}}} // namespace triton::backend::pytorch +} // namespace triton::backend::pytorch diff --git a/src/libtorch.hh b/src/libtorch.hh new file mode 100644 index 0000000..263c340 --- /dev/null +++ b/src/libtorch.hh @@ -0,0 +1,59 @@ +// Copyright 2019-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions +// are met: +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// * Neither the name of NVIDIA CORPORATION nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. 
+// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +#include "model_instance_state.hh" +#include "model_state.hh" +#include "naming_convention.hh" +#include "string_utilities.hh" + +// +// PyTorch C++ (LibTorch) Backend that implements the TRITONBACKEND API. +// + +namespace triton::backend::pytorch { + +extern "C" { + +TRITONSERVER_Error* TRITONBACKEND_Initialize(TRITONBACKEND_Backend* backend); + +TRITONSERVER_Error* TRITONBACKEND_ModelInitialize(TRITONBACKEND_Model* model); + +TRITONSERVER_Error* TRITONBACKEND_ModelFinalize(TRITONBACKEND_Model* model); + +TRITONSERVER_Error* TRITONBACKEND_ModelInstanceInitialize( + TRITONBACKEND_ModelInstance* instance); + +TRITONSERVER_Error* TRITONBACKEND_ModelInstanceFinalize( + TRITONBACKEND_ModelInstance* instance); + +TRITONSERVER_Error* TRITONBACKEND_ModelInstanceExecute( + TRITONBACKEND_ModelInstance* instance, TRITONBACKEND_Request** requests, + const uint32_t request_count); + +} // extern "C" + + +} // namespace triton::backend::pytorch diff --git a/src/model_instance_state.cc b/src/model_instance_state.cc new file mode 100644 index 0000000..7cd5ee3 --- /dev/null +++ b/src/model_instance_state.cc @@ -0,0 +1,1632 @@ +// Copyright 2019-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions +// are met: +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// * Neither the name of NVIDIA CORPORATION nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +#include "model_instance_state.hh" +#include "string_utilities.hh" + +#ifdef TRITON_PYTORCH_ENABLE_TORCHVISION +// Suppress warnings in torch headers +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wsign-compare" +#pragma warning(push, 0) +#include +#include // Torchvision header +#pragma warning(pop) +#pragma GCC diagnostic pop +#endif // TRITON_PYTORCH_ENABLE_TORCHVISION + +#ifdef TRITON_ENABLE_GPU +#include +#include +#include +#endif // TRITON_ENABLE_GPU + + +namespace triton::backend::pytorch { + +ModelInstanceState::ModelInstanceState( + ModelState* model_state, TRITONBACKEND_ModelInstance* triton_model_instance) + : BackendModelInstance(model_state, triton_model_instance), + model_state_(model_state), device_(torch::kCPU), is_dict_input_(false), + device_cnt_(0) +{ + if (Kind() == TRITONSERVER_INSTANCEGROUPKIND_GPU) { +#ifdef TRITON_ENABLE_GPU + device_ = torch::Device(torch::kCUDA, DeviceId()); + CreateCudaEvents(DeviceId()); +#endif + } + +#ifdef TRITON_ENABLE_GPU + device_cnt_ = torch::cuda::device_count(); +#endif + + THROW_IF_BACKEND_INSTANCE_ERROR(model_state->LoadModel( + ArtifactFilename(), device_, &model_path_, Kind(), &torch_model_)); + + if (Kind() == TRITONSERVER_INSTANCEGROUPKIND_MODEL) { +#ifdef TRITON_ENABLE_GPU + // Since we cannot determine the exact devices used by the model, we create + // a CUDA stream for every available device to ensure proper synchronization + // of CUDA streams. This approach may have implications when a timestamp is + // captured on a device that is not used by the model. Currently, this issue + // is addressed by synchronizing the CUDA streams before recording + // timestamps to prevent timestamp skewing. However, in the future, any + // modifications to the CUDA stream synchronization logic should be handled + // with caution. + for (int i = 0; i < device_cnt_; i++) { + cudaStream_t stream; + THROW_IF_BACKEND_INSTANCE_ERROR( + CreateCudaStream(i, 0 /* cuda_stream_priority */, &stream)); + stream_vec_.push_back(stream); + } + if (!stream_vec_.empty()) { + // Create CUDA events on the first device that will be used for collecting + // inputs/outputs. + CreateCudaEvents(0); + } +#endif + } + + size_t expected_input_cnt = 0; + { + triton::common::TritonJson::Value inputs; + if (model_state->ModelConfig().Find("input", &inputs)) { + expected_input_cnt = inputs.ArraySize(); + } + + triton::common::TritonJson::Value config_batch_inputs; + if (model_state->ModelConfig().Find("batch_input", &config_batch_inputs)) { + batch_input_count_ = config_batch_inputs.ArraySize(); + expected_input_cnt += batch_input_count_; + } + } + + // If this is a sequence model then make sure that the required + // inputs are present in the model and have the correct shape and + // datatype. 
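The constructor above creates one non-default CUDA stream per visible device when the instance group kind is `KIND_MODEL`, so that work on any device the model may touch can later be synchronized. The standalone sketch below is not part of the patch; it assumes only the CUDA runtime and shows the same per-device pattern that the backend's `CreateCudaStream` helper encapsulates.

```
// Standalone sketch: one non-default CUDA stream per visible device.
// Assumes a CUDA toolkit; build with `nvcc streams.cc`.
#include <cuda_runtime_api.h>

#include <cstdio>
#include <vector>

int main()
{
  int device_count = 0;
  if (cudaGetDeviceCount(&device_count) != cudaSuccess || device_count == 0) {
    std::fprintf(stderr, "no CUDA devices visible\n");
    return 1;
  }

  std::vector<cudaStream_t> streams(device_count, nullptr);
  for (int i = 0; i < device_count; ++i) {
    // Streams are created on the current device, so select it first.
    cudaSetDevice(i);
    cudaStreamCreateWithFlags(&streams[i], cudaStreamNonBlocking);
  }
  std::printf("created %d per-device stream(s)\n", device_count);

  for (int i = 0; i < device_count; ++i) {
    cudaSetDevice(i);
    cudaStreamDestroy(streams[i]);
  }
  return 0;
}
```

Keeping one stream per device, rather than the default stream, is what lets the backend later synchronize only its own work before taking timestamps.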
+ triton::common::TritonJson::Value sequence_batching; + if (model_state->ModelConfig().Find( + "sequence_batching", &sequence_batching)) { + bool have_start, have_end, have_ready, have_corrid; + THROW_IF_BACKEND_INSTANCE_ERROR(ValidateBooleanSequenceControl( + sequence_batching, "CONTROL_SEQUENCE_START", false /* required */, + &have_start)); + THROW_IF_BACKEND_INSTANCE_ERROR(ValidateBooleanSequenceControl( + sequence_batching, "CONTROL_SEQUENCE_END", false /* required */, + &have_end)); + THROW_IF_BACKEND_INSTANCE_ERROR(ValidateBooleanSequenceControl( + sequence_batching, "CONTROL_SEQUENCE_READY", false /* required */, + &have_ready)); + THROW_IF_BACKEND_INSTANCE_ERROR(ValidateTypedSequenceControl( + sequence_batching, "CONTROL_SEQUENCE_CORRID", false /* required */, + &have_corrid)); + if (have_start) { + expected_input_cnt += 1; + } + if (have_end) { + expected_input_cnt += 1; + } + if (have_ready) { + expected_input_cnt += 1; + } + if (have_corrid) { + expected_input_cnt += 1; + } + // Add the state inputs to the expected count + triton::common::TritonJson::Value states; + if (sequence_batching.Find("state", &states)) { + expected_input_cnt += states.ArraySize(); + } + } + supports_batching_ = model_state_->MaxBatchSize() > 0; + + THROW_IF_BACKEND_INSTANCE_ERROR(ValidateInputs(expected_input_cnt)); + THROW_IF_BACKEND_INSTANCE_ERROR(ValidateOutputs()); +} + +ModelInstanceState::~ModelInstanceState() +{ + torch_model_.reset(); + ClearCache(); + + if (Kind() == TRITONSERVER_INSTANCEGROUPKIND_MODEL) { +#ifdef TRITON_ENABLE_GPU + for (size_t i = 0; i < stream_vec_.size(); i++) { + LOG_IF_ERROR( + ConvertCUDAStatusToTritonError( + cudaSetDevice(i), TRITONSERVER_ERROR_INTERNAL, + "Failed to set the device"), + "Failed to set the device"); + + LOG_IF_ERROR( + ConvertCUDAStatusToTritonError( + cudaStreamDestroy(stream_vec_[i]), TRITONSERVER_ERROR_INTERNAL, + "Failed to destroy cuda stream"), + "~ModelInstanceState error: "); + stream_vec_[i] = nullptr; + } +#endif + } +} + +void +ModelInstanceState::AddInputToMap( + NamingConvention naming_convention, + const std::vector allowed_inputs, const std::string& io_name, + const uint32_t index) +{ + std::string deliminator = "__"; + + if (is_dict_input_) { + // If dictionary, index is irrelevant but we use the map to store the + // input names since they are the keys for the dictionary + input_index_map_[io_name] = index; + } else { + switch (naming_convention) { + case NamingConvention::FORWARD_ARGUMENT: { + auto itr = + std::find(allowed_inputs.begin(), allowed_inputs.end(), io_name); + if (itr != allowed_inputs.end()) { + input_index_map_[io_name] = + std::distance(allowed_inputs.begin(), itr); + } + return; + } + case NamingConvention::NAMED_INDEX: { + int start_pos = io_name.find(deliminator); + int ip_index = std::atoi(io_name.substr(start_pos + 2).c_str()); + input_index_map_[io_name] = ip_index; + return; + } + case NamingConvention::STRICT_CONFIG_ORDERING: { + input_index_map_[io_name] = index; + return; + } + } + } +} + +void +ModelInstanceState::ClearCache() +{ +#ifdef TRITON_ENABLE_GPU + if (device_.is_cuda() || + ((Kind() == TRITONSERVER_INSTANCEGROUPKIND_MODEL) && (device_cnt_ > 0))) { + c10::cuda::CUDACachingAllocator::emptyCache(); + } +#endif // TRITON_ENABLE_GPU +} + +TRITONSERVER_Error* +ModelInstanceState::Create( + ModelState* model_state, TRITONBACKEND_ModelInstance* triton_model_instance, + ModelInstanceState** state) +{ + try { + *state = new ModelInstanceState(model_state, triton_model_instance); + } + catch (const 
BackendModelInstanceException& ex) { + RETURN_ERROR_IF_TRUE( + ex.err_ == nullptr, TRITONSERVER_ERROR_INTERNAL, + std::string("unexpected nullptr in BackendModelInstanceException")); + RETURN_IF_ERROR(ex.err_); + } + + return nullptr; // success +} + +void +ModelInstanceState::Execute( + std::vector* responses, + const uint32_t response_count, + std::vector* input_tensors, + std::vector* output_tensors) +{ + NVTX_RANGE(nvtx_, "Execute " + Name()); + + torch::jit::IValue model_outputs_; + + try { + // enable/disable optimized execution + torch::jit::setGraphExecutorOptimize( + model_state_->EnabledOptimizedExecution()); + + // enable/disable inference mode - supersedes NoGradGuard + torch::InferenceMode infer_guard(model_state_->EnabledInferenceMode()); + + // enable/disable cudnn + at::globalContext().setUserEnabledCuDNN(model_state_->EnabledCudnn()); + + // JIT. No change is made unless parameter is explicitly set. + if (std::get<0>(model_state_->EnabledJitProfiling())) { + torch::jit::getProfilingMode() = + std::get<1>(model_state_->EnabledJitProfiling()); + } + + if (std::get<0>(model_state_->EnabledJitExecutor())) { + torch::jit::getExecutorMode() = + std::get<1>(model_state_->EnabledJitExecutor()); + } + + // Fuser. No change is made unless fuser is explicitly set in + // parameters. + if (std::get<0>(model_state_->EnabledTensorExprFuser())) { + torch::jit::setTensorExprFuserEnabled( + std::get<1>(model_state_->EnabledTensorExprFuser())); + } + + torch::NoGradGuard no_grad; + + // If input is a dictionary, prepare dictionary from 'input_tensors'. + if (is_dict_input_) { + torch::Dict input_dict; + for (auto& input_index : input_index_map_) { + torch::jit::IValue ival = (*input_tensors)[input_index.second]; + input_dict.insert(input_index.first, ival.toTensor()); + } + std::vector input_dict_ivalue = {input_dict}; + model_outputs_ = torch_model_->forward(input_dict_ivalue); + } else { + model_outputs_ = torch_model_->forward(*input_tensors); + } + + if (model_outputs_.isTuple()) { + auto model_outputs_tuple = model_outputs_.toTuple(); + size_t op_index = 0; + for (auto& m_op : model_outputs_tuple->elements()) { + if (m_op.isList()) { + auto list_output = m_op.toList(); + if (list_output.elementType()->kind() != c10::TypeKind::StringType) { + throw std::invalid_argument( + "output at index " + std::to_string(op_index) + + " must be of type Tensor or List[str], received List[" + + list_output.elementType()->str() + "]"); + } + output_tensors->push_back(m_op); + } else { + auto tensor_output = m_op.toTensor(); + output_tensors->push_back(m_op); + } + op_index++; + } + } else if (model_outputs_.isTensor()) { + output_tensors->push_back(model_outputs_); + } else if (model_outputs_.isList()) { + auto list_output = model_outputs_.toList(); + if (list_output.elementType()->kind() != c10::TypeKind::StringType) { + throw std::invalid_argument( + "output must be of type Tensor or List[str], received List[" + + list_output.elementType()->str() + "]"); + } + output_tensors->push_back(model_outputs_); + } else { + throw std::invalid_argument( + "output must be of type Tensor, List[str] or Tuple containing one of " + "these two types. 
It should not be a List / Dictionary of Tensors or " + "a Scalar"); + } + } + catch (std::exception& ex) { + SendErrorForResponses( + responses, response_count, + TRITONSERVER_ErrorNew( + TRITONSERVER_ERROR_INTERNAL, + ("PyTorch execute failure: " + std::string(ex.what())).c_str())); + } +} + +float +ModelInstanceState::GetCudaEventElapsedTime( + const cudaEvent_t& start_event, const cudaEvent_t& end_event) +{ + float duration = 0; +#ifdef TRITON_ENABLE_GPU + // [FIXME] in the case of cudaEventElapsedTime failure, should handle + // stats reporting more gracefully as the durations are inaccurate + LOG_IF_ERROR( + ConvertCUDAStatusToTritonError( + cudaEventElapsedTime(&duration, start_event, end_event), + TRITONSERVER_ERROR_INTERNAL, "Failed to capture elapsed time"), + "Failed to capture elapsed time"); +#endif + return duration; +} + + +cudaStream_t +ModelInstanceState::GetCudaStreamByInstanceKind() +{ +#ifdef TRITON_ENABLE_GPU + if (Kind() == TRITONSERVER_INSTANCEGROUPKIND_GPU) { + return stream_; + } else if ( + (Kind() == TRITONSERVER_INSTANCEGROUPKIND_MODEL) && + !stream_vec_.empty()) { + return stream_vec_[0]; + } +#endif + return nullptr; +} + +TRITONSERVER_Error* +ModelInstanceState::GetNamingConvention( + NamingConvention* naming_convention, + const std::vector& allowed_ios) +{ + // Rules for (non-Dictionary) input tensor names: + // 1. Must be in 'allowed_inputs' (arguments in the forward function) + // 2. Must follow the naming convention i.e. __ + // 3. If neither of the above conditions are satisfied, enforce strict + // ordering of model inputs. + // + // Rules for output tensor names: + // 1. Must follow the naming convention i.e. __ + // 2. If not, we enforce strict ordering of model outputs. + std::string deliminator = "__"; + std::string io_kind = "input"; + *naming_convention = NamingConvention::FORWARD_ARGUMENT; + + // symbolizes output + if (allowed_ios.size() == 0) { + io_kind = "output"; + *naming_convention = NamingConvention::NAMED_INDEX; + } + + triton::common::TritonJson::Value ios; + RETURN_IF_ERROR( + model_state_->ModelConfig().MemberAsArray(io_kind.c_str(), &ios)); + + if (io_kind == "input") { + for (size_t i = 0; i < ios.ArraySize(); i++) { + triton::common::TritonJson::Value io; + RETURN_IF_ERROR(ios.IndexAsObject(i, &io)); + + // Validate name + std::string io_name; + RETURN_IF_ERROR(io.MemberAsString("name", &io_name)); + auto itr = std::find(allowed_ios.begin(), allowed_ios.end(), io_name); + if (itr == allowed_ios.end()) { + *naming_convention = NamingConvention::NAMED_INDEX; + break; + } + } + } + + // If not, check if inputs follow INDEX + if (*naming_convention == NamingConvention::NAMED_INDEX) { + for (size_t i = 0; i < ios.ArraySize(); i++) { + triton::common::TritonJson::Value io; + RETURN_IF_ERROR(ios.IndexAsObject(i, &io)); + + // Validate name + std::string io_name; + RETURN_IF_ERROR(io.MemberAsString("name", &io_name)); + int start_pos = io_name.find(deliminator); + if (start_pos == -1) { + *naming_convention = NamingConvention::STRICT_CONFIG_ORDERING; + break; + } else { + // check if the index part of the name is not an integer + std::string index_str = io_name.substr(start_pos + 2); + bool is_int = true; + for (auto itr = index_str.begin(); itr != index_str.end(); itr++) { + if (std::isdigit(*itr) == 0) { + is_int = false; + } + } + + if (!is_int) { + if (io_kind == "input") { + LOG_MESSAGE( + TRITONSERVER_LOG_WARN, + ("input '" + io_name + + "' or previous input(s) are neither an input argument to the " + "model '" + + 
model_state_->Name() + + "' nor do they follow the __ naming convention. " + "Falling back to enforcing strict ordering from model " + "configuration.") + .c_str()); + } else { + LOG_MESSAGE( + TRITONSERVER_LOG_WARN, + ("output '" + io_name + + "' or previous output(s) of the model '" + + model_state_->Name() + + "' do not follow the __ naming convention. " + "Falling back to enforcing strict ordering from model " + "configuration.") + .c_str()); + } + *naming_convention = NamingConvention::STRICT_CONFIG_ORDERING; + break; + } + } + } + } + + triton::common::TritonJson::Value sequence_batching; + if (model_state_->ModelConfig().Find( + "sequence_batching", &sequence_batching)) { + // If we need to manage state for the model, then we need to check + // the naming of the state adheres to both the input and output conventions + triton::common::TritonJson::Value states; + if (sequence_batching.Find("state", &states)) { + if (*naming_convention != NamingConvention::NAMED_INDEX) { + return TRITONSERVER_ErrorNew( + TRITONSERVER_ERROR_INVALID_ARG, + ("PyTorch model '" + model_state_->Name() + + "' is using sequence batching with state but not all inputs and " + "outputs follow the __ naming convention. ") + .c_str()); + } + } + + for (size_t i = 0; i < states.ArraySize(); i++) { + triton::common::TritonJson::Value state; + RETURN_IF_ERROR(states.IndexAsObject(i, &state)); + std::string name_entry = + io_kind == "input" ? "input_name" : "output_name"; + std::string state_name; + RETURN_IF_ERROR(state.MemberAsString(name_entry.c_str(), &state_name)); + int start_pos = state_name.find(deliminator); + if (start_pos == -1) { + return TRITONSERVER_ErrorNew( + TRITONSERVER_ERROR_INVALID_ARG, + ("PyTorch model '" + model_state_->Name() + + "' is using sequence batching with state but state '" + + state_name + + "' does not follow the __ naming convention. ") + .c_str()); + } else { + // check if the index part of the name is not an integer + std::string index_str = state_name.substr(start_pos + 2); + bool is_int = true; + for (auto itr = index_str.begin(); itr != index_str.end(); itr++) { + if (std::isdigit(*itr) == 0) { + is_int = false; + } + } + if (!is_int) { + return TRITONSERVER_ErrorNew( + TRITONSERVER_ERROR_INVALID_ARG, + ("PyTorch model '" + model_state_->Name() + + "' is using sequence batching with state but state '" + + state_name + + "' does not follow the __ naming convention. ") + .c_str()); + } + } + } + } + + return nullptr; // success +} + +void +ModelInstanceState::ProcessRequests( + TRITONBACKEND_Request** requests, const uint32_t request_count) +{ + LOG_MESSAGE( + TRITONSERVER_LOG_VERBOSE, + (std::string("TRITONBACKEND_ModelExecute: Running ") + Name() + " with " + + std::to_string(request_count) + " requests") + .c_str()); + +#ifdef TRITON_ENABLE_GPU + if (Kind() == TRITONSERVER_INSTANCEGROUPKIND_GPU) { + SetCurrentCudaStream(stream_, DeviceId()); + } else if (Kind() == TRITONSERVER_INSTANCEGROUPKIND_MODEL) { + // Replace the default stream of each device with the one we created. + for (size_t i = 0; i < stream_vec_.size(); i++) { + SetCurrentCudaStream(stream_vec_[i], i); + } + } +#endif + + NVTX_RANGE(nvtx_, "ProcessRequests " + Name()); + + uint64_t exec_start_ns = 0; + SET_TIMESTAMP(exec_start_ns); + + const int max_batch_size = model_state_->MaxBatchSize(); + + // For each request collect the total batch size for this inference + // execution. The batch-size, number of inputs, and size of each + // input has already been checked so don't need to do that here. 
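The naming-convention logic above keys off a `__` delimiter: inputs and outputs named in the form `<name>__<index>` (for example `OUTPUT__0`) are mapped by the integer suffix, and anything else falls back to strict configuration ordering. A minimal standalone sketch of that parsing rule follows; `parse_io_index` is a hypothetical helper, not a backend function.

```
// Standalone sketch of <name>__<index> parsing: find the "__" delimiter and
// require an all-digit suffix, as GetNamingConvention/AddInputToMap do.
#include <cctype>
#include <iostream>
#include <optional>
#include <string>

std::optional<int>
parse_io_index(const std::string& io_name)
{
  const std::string delimiter = "__";
  const size_t pos = io_name.find(delimiter);
  if (pos == std::string::npos) {
    return std::nullopt;  // falls back to strict config ordering
  }
  const std::string suffix = io_name.substr(pos + delimiter.size());
  if (suffix.empty()) {
    return std::nullopt;
  }
  for (const char c : suffix) {
    if (!std::isdigit(static_cast<unsigned char>(c))) {
      return std::nullopt;  // e.g. "INPUT__a" does not qualify
    }
  }
  return std::stoi(suffix);
}

int main()
{
  for (const std::string name : {"INPUT__0", "OUTPUT__12", "logits"}) {
    const auto idx = parse_io_index(name);
    std::cout << name << " -> "
              << (idx ? std::to_string(*idx) : std::string("no index")) << "\n";
  }
  return 0;
}
```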
+ size_t total_batch_size = 0; + for (size_t i = 0; i < request_count; i++) { + // If we get a nullptr request then something is badly wrong. Fail + // and release all requests. + if (requests[i] == nullptr) { + RequestsRespondWithError( + requests, request_count, + TRITONSERVER_ErrorNew( + TRITONSERVER_ERROR_INTERNAL, + std::string( + "null request given to PyTorch backend for '" + Name() + "'") + .c_str())); + return; + } + } + + // At this point we are committed to running inference with all + // 'requests'. Create a response for each request. During input + // processing if there is an error with any request that error will + // be sent immediately with the corresponding response (and the + // response unique_ptr will then be nullptr). The request object + // itself will not be released until after all inferencing is done + // (below) as we may need to access the request object when + // determine how to process outputs (for example, even if we don't + // need the outputs for a request that has an error, we do need to + // know the size of those outputs associated with the request so we + // can skip them in the output tensors). + std::vector responses; + responses.reserve(request_count); + bool all_response_failed = false; + + for (size_t i = 0; i < request_count; i++) { + TRITONBACKEND_Response* response; + auto err = TRITONBACKEND_ResponseNew(&response, requests[i]); + if (err == nullptr) { + responses.emplace_back(response); + } else { + responses.emplace_back(nullptr); + LOG_MESSAGE(TRITONSERVER_LOG_ERROR, "Fail to create response"); + TRITONSERVER_ErrorDelete(err); + } + } + + for (size_t i = 0; i < request_count; i++) { + if (max_batch_size > 0) { + // Retrieve the batch size from one of the inputs, if the model + // supports batching, the first dimension size is batch size. + TRITONBACKEND_Input* input; + TRITONSERVER_Error* err = + TRITONBACKEND_RequestInputByIndex(requests[i], 0 /* index */, &input); + if (err == nullptr) { + const int64_t* shape; + err = TRITONBACKEND_InputProperties( + input, nullptr, nullptr, &shape, nullptr, nullptr, nullptr); + total_batch_size += shape[0]; + } + if (err != nullptr) { + RESPOND_ALL_AND_SET_TRUE_IF_ERROR( + responses, request_count, all_response_failed, err); + } + } else { + total_batch_size += 1; + } + } + + // If there are no valid payloads then no need to run the inference. + if (total_batch_size == 0) { + return; + } + + // Make sure the maximum batch size is not exceeded. The + // total_batch_size must be 1 for models that don't support batching + // (i.e. max_batch_size == 0). If max_batch_size is exceeded then + // scheduler has done something badly wrong so fail and release all + // requests. + if (!all_response_failed) { + if ((total_batch_size != 1) && + (total_batch_size > (size_t)max_batch_size)) { + RESPOND_ALL_AND_SET_TRUE_IF_ERROR( + responses, request_count, all_response_failed, + TRITONSERVER_ErrorNew( + TRITONSERVER_ERROR_INTERNAL, + std::string( + "batch size " + std::to_string(total_batch_size) + " for '" + + Name() + "', max allowed is " + + std::to_string(max_batch_size)) + .c_str())); + } + } + + std::vector input_names; + std::vector input_tensors; + bool cuda_copy = false; + std::unique_ptr collector; + + // For 'KIND_MODEL', it's fine to use CUDA events to calculate the compute + // input duration since only one stream will be used for input collection. 
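The compute-input and compute-infer durations above are measured with CUDA events recorded on the collection stream; `cudaEventElapsedTime` reports milliseconds, which is why the statistics code later multiplies by `1e6` to obtain nanoseconds. A reduced, standalone sketch of that timing pattern (CUDA runtime only, not part of the patch):

```
// Standalone sketch: time GPU work with CUDA events, then convert the
// millisecond result to nanoseconds as the backend's statistics code does.
#include <cuda_runtime_api.h>

#include <cstdint>
#include <cstdio>

int main()
{
  cudaStream_t stream;
  cudaEvent_t start, stop;
  cudaStreamCreate(&stream);
  cudaEventCreate(&start);
  cudaEventCreate(&stop);

  void* buf = nullptr;
  cudaMalloc(&buf, 64 << 20);

  cudaEventRecord(start, stream);
  cudaMemsetAsync(buf, 0, 64 << 20, stream);  // stand-in for real work
  cudaEventRecord(stop, stream);
  cudaEventSynchronize(stop);  // elapsed time is valid only after both events complete

  float ms = 0.0f;
  cudaEventElapsedTime(&ms, start, stop);
  const uint64_t ns = static_cast<uint64_t>(ms * 1e6);
  std::printf("memset took %.3f ms (%llu ns)\n", ms, (unsigned long long)ns);

  cudaFree(buf);
  cudaEventDestroy(start);
  cudaEventDestroy(stop);
  cudaStreamDestroy(stream);
  return 0;
}
```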
+ if ((Kind() == TRITONSERVER_INSTANCEGROUPKIND_GPU) || + ((Kind() == TRITONSERVER_INSTANCEGROUPKIND_MODEL) && (device_cnt_ > 0))) { +#ifdef TRITON_ENABLE_GPU + RESPOND_ALL_AND_SET_TRUE_IF_ERROR( + responses, request_count, all_response_failed, + ConvertCUDAStatusToTritonError( + cudaEventRecord( + compute_input_start_event_, GetCudaStreamByInstanceKind()), + TRITONSERVER_ERROR_INTERNAL, "Failed to record the event.")); +#endif + } + + if (!all_response_failed) { + collector.reset(new BackendInputCollector( + requests, request_count, &responses, + model_state_->TritonMemoryManager(), model_state_->EnablePinnedInput(), + GetCudaStreamByInstanceKind(), nullptr, nullptr, 0, + HostPolicyName().c_str())); + RESPOND_ALL_AND_SET_TRUE_IF_ERROR( + responses, request_count, all_response_failed, + SetInputTensors( + total_batch_size, requests, request_count, &responses, + collector.get(), &input_names, &input_tensors, &cuda_copy)); + } + +#ifdef TRITON_ENABLE_GPU + if (cuda_copy) { + cudaStreamSynchronize(GetCudaStreamByInstanceKind()); + cuda_copy = false; + } +#endif + + std::vector output_tensors; + uint64_t compute_start_ns = 0; + uint64_t compute_infer_start = 0; + + RESPOND_ALL_AND_SET_TRUE_IF_ERROR( + responses, request_count, all_response_failed, + RecordBackendTimestamp( + &compute_start_ns, + reinterpret_cast(&compute_infer_start_event_))); + + // For 'KIND_MODEL', capture the timestamp for the compute infer duration. + if ((Kind() == TRITONSERVER_INSTANCEGROUPKIND_MODEL) && (device_cnt_ > 0)) { + SET_TIMESTAMP(compute_infer_start); + } + + // Run... + if (!all_response_failed) { + Execute(&responses, request_count, &input_tensors, &output_tensors); + } + + // Verify output indices are valid with number of outputs after execution + bool invalid_index = false; + int max_index = output_tensors.size() - 1; + + if (!all_response_failed) { + for (const auto& name : model_state_->ModelOutputs()) { + int op_index = output_index_map_[name.first]; + if ((op_index < 0) || (op_index > max_index)) { + RESPOND_ALL_AND_SET_TRUE_IF_ERROR( + responses, request_count, all_response_failed, + TRITONSERVER_ErrorNew( + TRITONSERVER_ERROR_INVALID_ARG, + std::string( + "The output " + std::string(name.first) + + " in the model configuration refers to an output index " + "which doesn't exist. This model has " + + std::to_string(max_index + 1) + " outputs") + .c_str())); + invalid_index = true; + break; + } + } + } + +#ifdef TRITON_ENABLE_GPU + if (Kind() == TRITONSERVER_INSTANCEGROUPKIND_MODEL) { + // For 'KIND_MODEL', multiple streams will be involved, so we need to call + // 'cudaStreamSynchronize' before reading the output tensors. 
+ for (auto& stream : stream_vec_) { + cudaStreamSynchronize(stream); + } + } +#endif + + uint64_t compute_end_ns = 0; + uint64_t compute_output_start = 0; + + if ((Kind() == TRITONSERVER_INSTANCEGROUPKIND_MODEL) && (device_cnt_ > 0)) { +#ifdef TRITON_ENABLE_GPU + SET_TIMESTAMP(compute_output_start); +#endif + } else { + RESPOND_ALL_AND_SET_TRUE_IF_ERROR( + responses, request_count, all_response_failed, + RecordBackendTimestamp( + &compute_end_ns, + reinterpret_cast(&compute_output_start_event_))); + } + + if (!all_response_failed) { + if (!invalid_index) { + RESPOND_ALL_AND_SET_TRUE_IF_ERROR( + responses, request_count, all_response_failed, + ReadOutputTensors( + total_batch_size, output_tensors, requests, request_count, + &responses)); + } + } + + uint64_t exec_end_ns = 0; + SET_TIMESTAMP(exec_end_ns); + + // Send all the responses that haven't already been sent because of + // an earlier error. Note that the responses are not set to nullptr + // here as we need that indication below to determine if the request + // we successful or not. + for (auto& response : responses) { + if (response != nullptr) { + LOG_IF_ERROR( + TRITONBACKEND_ResponseSend( + response, TRITONSERVER_RESPONSE_COMPLETE_FINAL, nullptr), + "failed to send PyTorch backend response"); + } + } + + // We don't need an explicit CUDA syncrhonization here since we have already + // synchronized the stream in the ReadOutputTensors function. + if (Kind() == TRITONSERVER_INSTANCEGROUPKIND_GPU) { +#ifdef TRITON_ENABLE_GPU + float compute_input_duration = GetCudaEventElapsedTime( + compute_input_start_event_, compute_infer_start_event_); + float compute_infer_duration = GetCudaEventElapsedTime( + compute_infer_start_event_, compute_output_start_event_); + + compute_start_ns = exec_start_ns + (compute_input_duration * 1e6); + compute_end_ns = compute_start_ns + (compute_infer_duration * 1e6); +#endif + } else if ( + (Kind() == TRITONSERVER_INSTANCEGROUPKIND_MODEL) && (device_cnt_ > 0)) { +#ifdef TRITON_ENABLE_GPU + float compute_input_duration = GetCudaEventElapsedTime( + compute_input_start_event_, compute_infer_start_event_); + uint64_t compute_infer_duration = + compute_output_start - compute_infer_start; + + compute_start_ns = exec_start_ns + (compute_input_duration * 1e6); + compute_end_ns = compute_start_ns + compute_infer_duration; +#endif + } + + // Report statistics for each request. + for (uint32_t r = 0; r < request_count; ++r) { + auto& request = requests[r]; + LOG_IF_ERROR( + TRITONBACKEND_ModelInstanceReportStatistics( + TritonModelInstance(), request, + (responses[r] != nullptr) /* success */, exec_start_ns, + compute_start_ns, compute_end_ns, exec_end_ns), + "failed reporting request statistics"); + + LOG_IF_ERROR( + TRITONBACKEND_RequestRelease(request, TRITONSERVER_REQUEST_RELEASE_ALL), + "failed releasing request"); + } + + if (!all_response_failed) { + // Report the entire batch statistics. 
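The statistics above mix two clock sources: CUDA events for GPU-backed instances and host timestamps otherwise, which is the branching that `RecordBackendTimestamp` implements. The sketch below is a hypothetical stand-in (`record_timestamp` is not a backend function); the host path produces a steady-clock nanosecond timestamp similar to what the `SET_TIMESTAMP` macro yields.

```
// Standalone sketch of the two timestamping paths: record a CUDA event on the
// collection stream for GPU-backed instances, otherwise take a host
// steady-clock timestamp in nanoseconds.
#include <cuda_runtime_api.h>

#include <chrono>
#include <cstdint>
#include <cstdio>

void
record_timestamp(
    bool use_cuda_event, cudaStream_t stream, cudaEvent_t event,
    uint64_t* host_timestamp_ns)
{
  if (use_cuda_event) {
    // Completes once all prior work on `stream` has finished; durations are
    // read back later with cudaEventElapsedTime.
    cudaEventRecord(event, stream);
  } else {
    *host_timestamp_ns =
        std::chrono::duration_cast<std::chrono::nanoseconds>(
            std::chrono::steady_clock::now().time_since_epoch())
            .count();
  }
}

int main()
{
  uint64_t ts = 0;
  record_timestamp(false /* use_cuda_event */, nullptr, nullptr, &ts);
  std::printf("host timestamp: %llu ns\n", (unsigned long long)ts);
  return 0;
}
```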
+ LOG_IF_ERROR( + TRITONBACKEND_ModelInstanceReportBatchStatistics( + TritonModelInstance(), total_batch_size, exec_start_ns, + compute_start_ns, compute_end_ns, exec_end_ns), + "failed reporting batch request statistics"); + } +} + +TRITONSERVER_Error* +ModelInstanceState::ReadOutputTensors( + size_t total_batch_size, + const std::vector& output_tensors, + TRITONBACKEND_Request** requests, const uint32_t request_count, + std::vector* responses) +{ + NVTX_RANGE(nvtx_, "ReadOutputTensors " + Name()); + + BackendOutputResponder responder( + requests, request_count, responses, model_state_->TritonMemoryManager(), + model_state_->MaxBatchSize() > 0, model_state_->EnablePinnedInput(), + GetCudaStreamByInstanceKind()); + + bool cuda_copy = false; + // The serialized string buffer must be valid until output copies are done + std::vector> string_buffer; + for (auto& output : model_state_->ModelOutputs()) { + int op_index = output_index_map_[output.first]; + auto name = output.first; + auto output_tensor_pair = output.second; + + if (output_tensors[op_index].isTensor()) { + torch::Tensor output_flat; + try { + output_flat = + output_tensors[op_index].toTensor().contiguous().flatten(); + } + catch (std::exception& ex) { + RETURN_IF_ERROR(TRITONSERVER_ErrorNew( + TRITONSERVER_ERROR_INTERNAL, + (std::string("output tensor '") + name + "' is not found") + .c_str())); + } + + // Verify output datatype matches datatype from model config + TRITONSERVER_DataType output_dtype = + ConvertTorchTypeToDataType(output_flat.scalar_type()); + TRITONSERVER_DataType config_datatype = output_dtype_map_[name]; + if (config_datatype != output_dtype) { + RETURN_IF_ERROR(TRITONSERVER_ErrorNew( + TRITONSERVER_ERROR_INVALID_ARG, + (std::string("configuration expects datatype TYPE_") + + TRITONSERVER_DataTypeString(config_datatype) + " for output '" + + name + "', model provides TYPE_" + + TRITONSERVER_DataTypeString(output_dtype)) + .c_str())); + } + + const char* output_buffer = + static_cast(output_flat.data_ptr()); + + // Output tensors may not reside on the same device as model + torch::Device tensor_device = output_flat.device(); + const auto memory_type = (tensor_device.type() == torch::kCPU) + ? TRITONSERVER_MEMORY_CPU + : TRITONSERVER_MEMORY_GPU; + const auto memory_id = + (tensor_device.type() == torch::kCPU) ? 
0 : tensor_device.index(); + + // Batch output doesn't support string data type yet, as it is not trivial + // to parse string output + const BatchOutput* batch_output = StateForModel()->FindBatchOutput(name); + if (batch_output == nullptr) { + // Get output shape + std::vector batchn_shape; + auto shape = output_tensors[op_index].toTensor().sizes(); + for (auto itr = shape.begin(); itr != shape.end(); itr++) { + batchn_shape.push_back(*itr); + } + + if (batchn_shape.size() == 0) { + return TRITONSERVER_ErrorNew( + TRITONSERVER_ERROR_INVALID_ARG, + (std::string("output '") + name + + "' is a scalar which is not supported.") + .c_str()); + } + if (output_tensor_pair.first != -1) { + responder.ProcessTensor( + name, output_dtype, batchn_shape, output_buffer, memory_type, + memory_id); + } + if (output_tensor_pair.second != -1) { + std::vector states; + states = responder.ProcessStateTensor( + name, output_dtype, batchn_shape, output_buffer, memory_type, + memory_id); + // Update the states + for (auto& state : states) { + RETURN_IF_ERROR(TRITONBACKEND_StateUpdate(state)); + } + } + + } else { + responder.ProcessBatchOutput( + name, *batch_output, output_buffer, memory_type, memory_id); + } + } else if (output_tensors[op_index].isList()) { + // Custom handling for string/bytes tensor... + torch::List output_list = + output_tensors[op_index].toList(); + + // Get output shape + std::vector batchn_shape{(int64_t)output_list.size()}; + + for (size_t idx = 0; idx < responses->size(); idx++) { + auto& request = requests[idx]; + auto& response = (*responses)[idx]; + + if (supports_batching_ != 0) { + TRITONBACKEND_Input* input; + TRITONBACKEND_RequestInputByIndex(request, 0 /* index*/, &input); + const int64_t* shape; + TRITONBACKEND_InputProperties( + input, nullptr, nullptr, &shape, nullptr, nullptr, nullptr); + batchn_shape[0] = shape[0]; + } + + int64_t tensor_element_cnt = 0; + RETURN_IF_ERROR(GetElementCount(batchn_shape, &tensor_element_cnt)); + + // Only need an response tensor for requested outputs. + if (response != nullptr) { + if (output_tensor_pair.first != -1) { + TRITONBACKEND_Output* response_output; + RESPOND_AND_SET_NULL_IF_ERROR( + &response, TRITONBACKEND_ResponseOutput( + response, &response_output, name.c_str(), + TRITONSERVER_TYPE_BYTES, batchn_shape.data(), + batchn_shape.size())); + string_buffer.emplace_back(new std::string()); + cuda_copy |= SetStringOutputBuffer( + &output_list, &response, response_output, tensor_element_cnt, + GetCudaStreamByInstanceKind(), string_buffer.back().get()); + } + } + if (output_tensor_pair.second != -1) { + TRITONBACKEND_State* response_state; + RESPOND_AND_SET_NULL_IF_ERROR( + &response, TRITONBACKEND_StateNew( + &response_state, request, name.c_str(), + TRITONSERVER_TYPE_BYTES, batchn_shape.data(), + batchn_shape.size())); + + string_buffer.emplace_back(new std::string()); + cuda_copy |= SetStringStateBuffer( + &output_list, &response, response_state, tensor_element_cnt, + GetCudaStreamByInstanceKind(), string_buffer.back().get()); + } + } + } else { + return TRITONSERVER_ErrorNew( + TRITONSERVER_ERROR_INVALID_ARG, + (std::string("output '") + name + + "' must be of type Tensor or List[str].") + .c_str()); + } + } + + // Finalize and wait for any pending buffer copies. + cuda_copy |= responder.Finalize(); + +#ifdef TRITON_ENABLE_GPU + // We have to always synchronize the stream. This is to make sure that + // the events on the cuda stream are synchronized. 
Otherwise, the events + // are only guaranteed to be synchronized if the model provides the output + // on GPU. + cudaStreamSynchronize(GetCudaStreamByInstanceKind()); +#endif + + return nullptr; +} + +TRITONSERVER_Error* +ModelInstanceState::RecordBackendTimestamp( + uint64_t* timestamp, void* cuda_event) +{ + if ((Kind() == TRITONSERVER_INSTANCEGROUPKIND_GPU) || + ((Kind() == TRITONSERVER_INSTANCEGROUPKIND_MODEL) && (device_cnt_ > 0))) { +#ifdef TRITON_ENABLE_GPU + cudaEvent_t* lcuda_event = reinterpret_cast(cuda_event); + RETURN_IF_ERROR(ConvertCUDAStatusToTritonError( + cudaEventRecord(*lcuda_event, GetCudaStreamByInstanceKind()), + TRITONSERVER_ERROR_INTERNAL, "Failed to record the event.")); +#endif + } else { + SET_TIMESTAMP(*timestamp); + } + return nullptr; +} + +void +ModelInstanceState::SetCurrentCudaStream( + const cudaStream_t& stream, const int& device_id) +{ +#ifdef TRITON_ENABLE_GPU + at::cuda::CUDAStream torch_stream = + at::cuda::getStreamFromExternal(stream, device_id); + // This function replaces the default stream with the stream we created. It + // is not necessary to change the current device to the desired device when + // replacing the default stream for that device. See the documentation here: + // https://pytorch.org/cppdocs/api/function_namespacec10_1_1cuda_1a6ed50cc0fc16cc7014d9c2f4c3bd098d.html + at::cuda::setCurrentCUDAStream(torch_stream); +#endif +} + +TRITONSERVER_Error* +ModelInstanceState::SetInputTensors( + size_t total_batch_size, TRITONBACKEND_Request** requests, + const uint32_t request_count, + std::vector* responses, + BackendInputCollector* collector, std::vector* input_names, + std::vector* input_tensors, bool* cuda_copy) +{ + // InferenceMode should be used to guard all tensors operations + torch::InferenceMode infer_guard(model_state_->EnabledInferenceMode()); + + // All requests must have equally-sized input tensors so use any + // request as the representative for the input tensors. + uint32_t input_count; + RETURN_IF_ERROR(TRITONBACKEND_RequestInputCount(requests[0], &input_count)); + + input_tensors->resize(input_count + batch_input_count_); + + // The inputs must be in contiguous CPU/GPU memory. + std::vector> alloc_perference; + if (device_.is_cpu()) { + alloc_perference = { + {TRITONSERVER_MEMORY_CPU_PINNED, 0}, {TRITONSERVER_MEMORY_CPU, 0}}; + } else { + alloc_perference = {{TRITONSERVER_MEMORY_GPU, device_.index()}}; + } + + for (uint32_t input_idx = 0; input_idx < input_count; input_idx++) { + TRITONBACKEND_Input* input; + RETURN_IF_ERROR( + TRITONBACKEND_RequestInputByIndex(requests[0], input_idx, &input)); + + const char* input_name; + TRITONSERVER_DataType input_datatype; + const int64_t* input_shape; + uint32_t input_dims_count; + RETURN_IF_ERROR(TRITONBACKEND_InputProperties( + input, &input_name, &input_datatype, &input_shape, &input_dims_count, + nullptr, nullptr)); + + input_names->emplace_back(input_name); + + // The shape for the entire input patch, + // [total_batch_size, ...] 
for non-ragged input and + // [total_element_count] for ragged input (non-nested tensor) + std::vector batchn_shape; + if (StateForModel()->IsInputRagged(input_name)) { + batchn_shape = std::vector{0}; + for (size_t idx = 0; idx < request_count; idx++) { + TRITONBACKEND_Input* input; + RESPOND_AND_SET_NULL_IF_ERROR( + &((*responses)[idx]), + TRITONBACKEND_RequestInput(requests[idx], input_name, &input)); + const int64_t* input_shape; + uint32_t input_dims_count; + RESPOND_AND_SET_NULL_IF_ERROR( + &((*responses)[idx]), TRITONBACKEND_InputProperties( + input, nullptr, nullptr, &input_shape, + &input_dims_count, nullptr, nullptr)); + + int64_t element_cnt = 0; + RESPOND_AND_SET_NULL_IF_ERROR( + &((*responses)[idx]), + GetElementCount(input_shape, input_dims_count, &element_cnt)); + batchn_shape[0] += element_cnt; + } + } else { + batchn_shape = + std::vector(input_shape, input_shape + input_dims_count); + if (supports_batching_) { + batchn_shape[0] = total_batch_size; + } + } + + // The input must be in contiguous CPU/GPU memory. + std::vector> alloc_perference; + // For 'KIND_MODEL', input will always be in CPU as we don't have a way to + // query the input types. + if (device_.is_cpu() || (Kind() == TRITONSERVER_INSTANCEGROUPKIND_MODEL)) { + alloc_perference = { + {TRITONSERVER_MEMORY_CPU_PINNED, 0}, {TRITONSERVER_MEMORY_CPU, 0}}; + } else { + alloc_perference = {{TRITONSERVER_MEMORY_GPU, device_.index()}}; + } + + const char* input_buffer; + size_t batchn_byte_size; + TRITONSERVER_MemoryType memory_type; + int64_t memory_type_id; + RETURN_IF_ERROR(collector->ProcessTensor( + input_name, nullptr, 0, alloc_perference, &input_buffer, + &batchn_byte_size, &memory_type, &memory_type_id)); + + // Create Torch tensor + const auto torch_dtype = ConvertDataTypeToTorchType(input_datatype); + torch::TensorOptions options{torch_dtype.second}; + auto updated_options = (memory_type == TRITONSERVER_MEMORY_GPU) + ? options.device(torch::kCUDA, device_.index()) + : options.device(torch::kCPU); + + if (input_datatype == TRITONSERVER_TYPE_BYTES) { + // Create the PyTorch list to hold the strings. 
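String (`TYPE_STRING` / BYTES) inputs cannot be wrapped with `from_blob`: each element arrives as a 4-byte length followed by the raw bytes, and `SetStringInputTensor` unpacks that into the list created below. Here is a standalone sketch of the unpacking itself, assuming that length-prefixed layout and a little-endian host; `unpack_bytes_tensor` is a hypothetical helper, not a backend function.

```
// Standalone sketch: unpack a Triton BYTES buffer (uint32 length + payload,
// repeated per element) into std::string elements.
#include <cstdint>
#include <cstring>
#include <iostream>
#include <string>
#include <vector>

std::vector<std::string>
unpack_bytes_tensor(const char* buffer, size_t byte_size, size_t element_count)
{
  std::vector<std::string> elements;
  size_t offset = 0;
  while (elements.size() < element_count &&
         offset + sizeof(uint32_t) <= byte_size) {
    uint32_t len = 0;
    std::memcpy(&len, buffer + offset, sizeof(uint32_t));  // little-endian host assumed
    offset += sizeof(uint32_t);
    if (offset + len > byte_size) {
      break;  // malformed element; the backend reports an error in this case
    }
    elements.emplace_back(buffer + offset, len);
    offset += len;
  }
  return elements;
}

int main()
{
  // Two elements, "hi" and "triton", each preceded by its 4-byte length.
  const char raw[] = "\x02\x00\x00\x00hi\x06\x00\x00\x00triton";
  for (const auto& s : unpack_bytes_tensor(raw, sizeof(raw) - 1, 2)) {
    std::cout << s << "\n";
  }
  return 0;
}
```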
+ torch::List input_list; + input_list.reserve(batchn_shape[0]); + + for (size_t idx = 0; idx < request_count; idx++) { + TRITONBACKEND_Input* input; + RESPOND_AND_SET_NULL_IF_ERROR( + &((*responses)[idx]), + TRITONBACKEND_RequestInput(requests[idx], input_name, &input)); + const int64_t* shape; + uint32_t dims_count; + uint32_t buffer_count; + RESPOND_AND_SET_NULL_IF_ERROR( + &((*responses)[idx]), + TRITONBACKEND_InputPropertiesForHostPolicy( + input, HostPolicyName().c_str(), nullptr, nullptr, &shape, + &dims_count, nullptr, &buffer_count)); + + int64_t batch_element_cnt = 0; + RESPOND_AND_SET_NULL_IF_ERROR( + &((*responses)[idx]), + GetElementCount(shape, dims_count, &batch_element_cnt)); + + *cuda_copy |= SetStringInputTensor( + &input_list, input, input_name, buffer_count, batch_element_cnt, + &((*responses)[idx]), GetCudaStreamByInstanceKind(), + HostPolicyName().c_str()); + } + + (*input_tensors)[input_index_map_[input_name]] = input_list; + } else { + if (batchn_byte_size) { + // Remove constness to align with the signature of torch::from_blob() + torch::Tensor input_tensor = torch::from_blob( + const_cast(input_buffer), batchn_shape, updated_options); + (*input_tensors)[input_index_map_[input_name]] = input_tensor; + } else { + // torch:from_blob seems not working when the input size is 0 + // create zero-length inputs directly + torch::Tensor input_tensor = + torch::zeros(batchn_shape, updated_options); + (*input_tensors)[input_index_map_[input_name]] = input_tensor; + } + } + } + + for (const auto& batch_input : StateForModel()->BatchInputs()) { + std::vector shape; + collector->BatchInputShape(batch_input, &shape); + + for (const auto& input_name : batch_input.TargetNames()) { + input_names->emplace_back(input_name.c_str()); + + const char* dst_buffer; + size_t dst_buffer_byte_size; + TRITONSERVER_MemoryType dst_memory_type; + int64_t dst_memory_type_id; + + RESPOND_ALL_AND_SET_NULL_IF_ERROR( + (*responses), responses->size(), + collector->ProcessBatchInput( + batch_input, nullptr, 0, alloc_perference, &dst_buffer, + &dst_buffer_byte_size, &dst_memory_type, &dst_memory_type_id)); + + const auto torch_dtype = + ConvertDataTypeToTorchType(batch_input.DataType()); + torch::TensorOptions options{torch_dtype.second}; + auto updated_options = (dst_memory_type == TRITONSERVER_MEMORY_GPU) + ? options.device(torch::kCUDA, device_.index()) + : options.device(torch::kCPU); + + if (dst_buffer_byte_size) { + torch::Tensor input_tensor = torch::from_blob( + const_cast(dst_buffer), shape, updated_options); + (*input_tensors)[input_index_map_[input_name]] = input_tensor; + } else { + // special handle when input has zero size + torch::Tensor input_tensor = torch::zeros(shape, updated_options); + (*input_tensors)[input_index_map_[input_name]] = input_tensor; + } + } + } + + // Finalize... 
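For non-string inputs the collector returns one contiguous buffer, which the code above wraps zero-copy with `torch::from_blob`, pointing the tensor options at the CPU or at the correct GPU; a zero-byte input falls back to `torch::zeros` instead. A reduced, CPU-only LibTorch sketch of that wrapping (assumes a LibTorch build; not part of the patch):

```
// Standalone LibTorch sketch: wrap an existing buffer without copying, the
// way SetInputTensors does for non-string inputs. CPU-only for simplicity.
#include <torch/torch.h>

#include <cstdint>
#include <iostream>
#include <vector>

int main()
{
  std::vector<float> buffer = {1.f, 2.f, 3.f, 4.f, 5.f, 6.f};
  const std::vector<int64_t> shape = {2, 3};

  auto options =
      torch::TensorOptions().dtype(torch::kFloat32).device(torch::kCPU);

  // from_blob does not own the memory; `buffer` must outlive the tensor.
  // The zero-size branch mirrors the backend's torch::zeros fallback.
  torch::Tensor wrapped = buffer.empty()
                              ? torch::zeros(shape, options)
                              : torch::from_blob(buffer.data(), shape, options);

  std::cout << wrapped << "\n";
  return 0;
}
```

Because the wrap is zero-copy, the backend keeps the collector's buffers alive until inference completes, which is why the input tensors are built and consumed within a single `ProcessRequests` call.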
+ *cuda_copy |= collector->Finalize(); + + return nullptr; +} + +TRITONSERVER_Error* +ModelInstanceState::ValidateBooleanSequenceControl( + triton::common::TritonJson::Value& sequence_batching, + const std::string& control_kind, bool required, bool* have_control) +{ + std::string tensor_name; + std::string tensor_datatype; + RETURN_IF_ERROR(GetBooleanSequenceControlProperties( + sequence_batching, model_state_->Name(), control_kind, required, + &tensor_name, &tensor_datatype, nullptr, nullptr, nullptr, nullptr, + nullptr, nullptr)); + *have_control = !tensor_name.empty(); + if (*have_control) { + std::string deliminator = "__"; + int ip_index = 0; + int start_pos = tensor_name.find(deliminator); + if (start_pos == -1) { + return TRITONSERVER_ErrorNew( + TRITONSERVER_ERROR_INTERNAL, + ("input '" + tensor_name + + "' does not follow __ naming convention.") + .c_str()); + } + + // check if the index part of the name is not an integer + std::string index_str = tensor_name.substr(start_pos + 2); + for (auto itr = index_str.begin(); itr != index_str.end(); itr++) { + if (std::isdigit(*itr) == 0) { + return TRITONSERVER_ErrorNew( + TRITONSERVER_ERROR_INTERNAL, + ("input '" + tensor_name + + "' does not follow __ naming convention.") + .c_str()); + } + } + + ip_index = std::atoi(tensor_name.substr(start_pos + 2).c_str()); + input_index_map_[tensor_name] = ip_index; + } + + return nullptr; // success +} + +TRITONSERVER_Error* +ModelInstanceState::ValidateInputs(const size_t expected_input_cnt) +{ + // Collect all the expected input tensor names and validate that the model + // configuration specifies only those. + std::vector allowed_inputs; + + const torch::jit::Method& method = torch_model_->get_method("forward"); + const auto& schema = method.function().getSchema(); + const std::vector& arguments = schema.arguments(); + + // Currently, only models with a single input of type Dict(str, Tensor) are + // supported. If the model expects more than one input then they must be all + // be of type Tensor. + // + // Ignore the argument at idx 0 if it is of Class type (self param in forward + // function) + size_t start_idx = 0; + if ((arguments.size() > 0) && + (arguments.at(0).type()->kind() == c10::TypeKind::ClassType)) { + start_idx = 1; + } + if ((arguments.size() == (1 + start_idx)) && + (arguments.at(start_idx).type()->kind() == c10::TypeKind::DictType)) { + is_dict_input_ = true; + } else if (arguments.size() > start_idx) { + // Return error if multiple inputs are of kind DictType + for (size_t i = start_idx + 1; i < arguments.size(); i++) { + if (arguments.at(i).type()->kind() == c10::TypeKind::DictType) { + return TRITONSERVER_ErrorNew( + TRITONSERVER_ERROR_INTERNAL, + "Multiple inputs of kind DictType were detected. Only a single " + "input of type Dict(str, Tensor) is supported."); + } + } + + // Return error if all inputs are not of type Tensor + for (size_t i = start_idx; i < arguments.size(); i++) { + if ((arguments.at(i).type()->kind() != c10::TypeKind::TensorType) && + (arguments.at(i).type()->kind() != c10::TypeKind::ListType)) { + return TRITONSERVER_ErrorNew( + TRITONSERVER_ERROR_INTERNAL, + (std::string("An input of type '") + arguments.at(i).type()->str() + + "' was detected in the model. 
Only a single input of type " + "Dict(str, Tensor) or input(s) of type Tensor are supported.") + .c_str()); + } + allowed_inputs.emplace_back(arguments.at(i).name()); + } + + // If all inputs are tensors, match number of expected inputs between model + // and configuration + if ((arguments.size() - start_idx) != expected_input_cnt) { + return TRITONSERVER_ErrorNew( + TRITONSERVER_ERROR_INVALID_ARG, + (std::string("unable to load model '") + model_state_->Name() + + "', configuration expects " + std::to_string(expected_input_cnt) + + " inputs, model provides " + + std::to_string(arguments.size() - start_idx)) + .c_str()); + } + } + + triton::common::TritonJson::Value ios; + RETURN_IF_ERROR(model_state_->ModelConfig().MemberAsArray("input", &ios)); + + if (ios.ArraySize() == 0) { + return TRITONSERVER_ErrorNew( + TRITONSERVER_ERROR_INTERNAL, + "model configuration must contain at least one input, none were " + "specified."); + } + + NamingConvention naming_convention; + RETURN_IF_ERROR(GetNamingConvention(&naming_convention, allowed_inputs)); + + for (size_t i = 0; i < ios.ArraySize(); i++) { + triton::common::TritonJson::Value io; + RETURN_IF_ERROR(ios.IndexAsObject(i, &io)); + + // Validate name + std::string io_name; + RETURN_IF_ERROR(io.MemberAsString("name", &io_name)); + AddInputToMap(naming_convention, allowed_inputs, io_name, i); + // Validate data type + std::string io_dtype; + RETURN_IF_ERROR(io.MemberAsString("data_type", &io_dtype)); + const auto pr = ModelConfigDataTypeToTorchType(io_dtype); + if (!pr.first && (io_dtype != "TYPE_STRING")) { + return TRITONSERVER_ErrorNew( + TRITONSERVER_ERROR_INTERNAL, + ("unsupported datatype " + io_dtype + " for input '" + io_name + + "' for model '" + model_state_->Name() + "'") + .c_str()); + } + + // Validate shape for String inputs. Only allow 1 dimension. + if (io_dtype == "TYPE_STRING") { + // If a reshape is provided for the input then use that when + // validating the model shapes. + std::vector dims; + triton::common::TritonJson::Value reshape; + if (io.Find("reshape", &reshape)) { + RETURN_IF_ERROR(ParseShape(reshape, "shape", &dims)); + } else { + RETURN_IF_ERROR(ParseShape(io, "dims", &dims)); + } + + if ((dims.size() + (supports_batching_ ? 1 : 0)) > 1) { + return TRITONSERVER_ErrorNew( + TRITONSERVER_ERROR_INTERNAL, + ("Triton only supports 1 dimensional List of String as input for " + "'" + + std::string(io_name) + "' for model '" + model_state_->Name() + + "'") + .c_str()); + } + } + } + triton::common::TritonJson::Value sequence_batching; + if (model_state_->ModelConfig().Find( + "sequence_batching", &sequence_batching)) { + triton::common::TritonJson::Value states; + if (sequence_batching.Find("state", &states)) { + for (size_t i = 0; i < states.ArraySize(); i++) { + triton::common::TritonJson::Value state; + RETURN_IF_ERROR(states.IndexAsObject(i, &state)); + std::string state_name; + RETURN_IF_ERROR(state.MemberAsString("input_name", &state_name)); + AddInputToMap(naming_convention, allowed_inputs, state_name, i); + + // Validate data type + std::string state_dtype; + RETURN_IF_ERROR(state.MemberAsString("data_type", &state_dtype)); + const auto pr = ModelConfigDataTypeToTorchType(state_dtype); + if (!pr.first && (state_dtype != "TYPE_STRING")) { + return TRITONSERVER_ErrorNew( + TRITONSERVER_ERROR_INTERNAL, + ("unsupported datatype " + state_dtype + " for input state '" + + state_name + "' for model '" + model_state_->Name() + "'") + .c_str()); + } + + // Validate shape for String inputs. Only allow 1 dimension. 
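`ValidateInputs` derives the set of allowed input names from the TorchScript `forward()` schema, as shown above. The standalone LibTorch sketch below inspects a schema in the same way; `model.pt` is a placeholder path and a LibTorch build is assumed.

```
// Standalone LibTorch sketch: list the forward() arguments of a TorchScript
// module, mirroring how ValidateInputs builds its allowed-inputs list.
#include <torch/script.h>

#include <iostream>

int main()
{
  torch::jit::Module module = torch::jit::load("model.pt");  // placeholder path

  auto method = module.get_method("forward");
  const auto& schema = method.function().getSchema();

  for (const auto& arg : schema.arguments()) {
    // The first argument is typically `self` (the module itself), which the
    // backend skips when it is of class type.
    std::cout << arg.name() << " : " << arg.type()->str() << "\n";
  }
  return 0;
}
```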
+ if (state_dtype == "TYPE_STRING") { + std::vector dims; + if ((dims.size() + (supports_batching_ ? 1 : 0)) > 1) { + return TRITONSERVER_ErrorNew( + TRITONSERVER_ERROR_INTERNAL, + ("Triton only supports 1 dimensional List of String as input " + "for " + "'" + + std::string(state_name) + "' for model '" + + model_state_->Name() + "'") + .c_str()); + } + } + } + } + } + + triton::common::TritonJson::Value batch_inputs; + RETURN_IF_ERROR( + model_state_->ModelConfig().MemberAsArray("batch_input", &batch_inputs)); + size_t i = 0; + for (const auto& batch_input : StateForModel()->BatchInputs()) { + for (const auto& input_name : batch_input.TargetNames()) { + AddInputToMap( + naming_convention, allowed_inputs, input_name, i + ios.ArraySize()); + i++; + } + } + + return nullptr; // success +} + +TRITONSERVER_Error* +ModelInstanceState::ValidateOutputs() +{ + triton::common::TritonJson::Value ios; + RETURN_IF_ERROR(model_state_->ModelConfig().MemberAsArray("output", &ios)); + std::string deliminator = "__"; + int op_index = 0; + + if (ios.ArraySize() == 0) { + return TRITONSERVER_ErrorNew( + TRITONSERVER_ERROR_INTERNAL, + "model configuration must contain at least one output, none were " + "specified."); + } + + NamingConvention naming_convention; + RETURN_IF_ERROR(GetNamingConvention(&naming_convention, {})); + + for (size_t i = 0; i < ios.ArraySize(); i++) { + triton::common::TritonJson::Value io; + RETURN_IF_ERROR(ios.IndexAsObject(i, &io)); + + // Validate name + std::string io_name; + RETURN_IF_ERROR(io.MemberAsString("name", &io_name)); + switch (naming_convention) { + case NamingConvention::NAMED_INDEX: { + int start_pos = io_name.find(deliminator); + op_index = std::atoi(io_name.substr(start_pos + 2).c_str()); + break; + } + case NamingConvention::STRICT_CONFIG_ORDERING: { + op_index = i; + break; + } + default: + break; + } + + // Validate data type + std::string io_dtype; + RETURN_IF_ERROR(io.MemberAsString("data_type", &io_dtype)); + const auto pr = ModelConfigDataTypeToTorchType(io_dtype); + if (!pr.first && (io_dtype != "TYPE_STRING")) { + return TRITONSERVER_ErrorNew( + TRITONSERVER_ERROR_INTERNAL, + ("unsupported datatype " + io_dtype + " for output '" + io_name + + "' for model '" + model_state_->Name() + "'") + .c_str()); + } + + // Validate shape for String outputs. Only allow 1 dimension. + if (io_dtype == "TYPE_STRING") { + // If a reshape is provided for the output then use that when + // validating the model shapes. + std::vector dims; + triton::common::TritonJson::Value reshape; + if (io.Find("reshape", &reshape)) { + RETURN_IF_ERROR(ParseShape(reshape, "shape", &dims)); + } else { + RETURN_IF_ERROR(ParseShape(io, "dims", &dims)); + } + + if ((dims.size() + (supports_batching_ ? 
1 : 0)) > 1) { + return TRITONSERVER_ErrorNew( + TRITONSERVER_ERROR_INTERNAL, + ("Triton only supports 1 dimensional List of String as output for " + "'" + + std::string(io_name) + "' for model '" + model_state_->Name() + + "'") + .c_str()); + } + } + + output_index_map_[io_name] = op_index; + output_dtype_map_[io_name] = ConvertTorchTypeToDataType(pr.second); + } + + triton::common::TritonJson::Value sequence_batching; + if (model_state_->ModelConfig().Find( + "sequence_batching", &sequence_batching)) { + triton::common::TritonJson::Value states; + if (sequence_batching.Find("state", &states)) { + for (size_t i = 0; i < states.ArraySize(); i++) { + triton::common::TritonJson::Value state; + RETURN_IF_ERROR(states.IndexAsObject(i, &state)); + std::string state_name; + RETURN_IF_ERROR(state.MemberAsString("output_name", &state_name)); + std::string state_dtype; + RETURN_IF_ERROR(state.MemberAsString("data_type", &state_dtype)); + std::vector dims; + RETURN_IF_ERROR(ParseShape(state, "dims", &dims)); + + // For state, naming convention is enforced to be NAMED_INDEX + int start_pos = state_name.find(deliminator); + op_index = std::atoi(state_name.substr(start_pos + 2).c_str()); + + const auto pr = ModelConfigDataTypeToTorchType(state_dtype); + if (!pr.first && (state_dtype != "TYPE_STRING")) { + return TRITONSERVER_ErrorNew( + TRITONSERVER_ERROR_INTERNAL, + ("unsupported datatype " + state_dtype + " for state '" + + state_name + "' for model '" + model_state_->Name() + "'") + .c_str()); + } + + // Validate shape for String outputs. Only allow 1 dimension. + if (state_dtype == "TYPE_STRING") { + if ((dims.size() + (supports_batching_ ? 1 : 0)) > 1) { + return TRITONSERVER_ErrorNew( + TRITONSERVER_ERROR_INTERNAL, + ("Triton only supports 1 dimensional List of String as output " + "for " + "'" + + std::string(state_name) + "' for model '" + + model_state_->Name() + "'") + .c_str()); + } + } + + output_index_map_[state_name] = op_index; + output_dtype_map_[state_name] = ConvertTorchTypeToDataType(pr.second); + } + } + } + + return nullptr; // success +} + +TRITONSERVER_Error* +ModelInstanceState::ValidateTypedSequenceControl( + triton::common::TritonJson::Value& sequence_batching, + const std::string& control_kind, bool required, bool* have_control) +{ + std::string tensor_name; + std::string tensor_datatype; + RETURN_IF_ERROR(GetTypedSequenceControlProperties( + sequence_batching, model_state_->Name(), control_kind, required, + &tensor_name, &tensor_datatype)); + *have_control = !tensor_name.empty(); + if (*have_control) { + std::string deliminator = "__"; + int ip_index = 0; + int start_pos = tensor_name.find(deliminator); + if (start_pos == -1) { + return TRITONSERVER_ErrorNew( + TRITONSERVER_ERROR_INTERNAL, + ("input '" + tensor_name + + "' does not follow __ naming convention.") + .c_str()); + } + + // check if the index part of the name is not an integer + std::string index_str = tensor_name.substr(start_pos + 2); + for (auto itr = index_str.begin(); itr != index_str.end(); itr++) { + if (std::isdigit(*itr) == 0) { + return TRITONSERVER_ErrorNew( + TRITONSERVER_ERROR_INTERNAL, + ("input '" + tensor_name + + "' does not follow __ naming convention.") + .c_str()); + } + } + + // check if the data type is supported by PyTorch + if (!ModelConfigDataTypeToTorchType(tensor_datatype).first) { + return TRITONSERVER_ErrorNew( + TRITONSERVER_ERROR_INTERNAL, + ("input '" + tensor_name + "' type '" + tensor_datatype + + "' is not supported by PyTorch.") + .c_str()); + } + + ip_index = 
std::atoi(tensor_name.substr(start_pos + 2).c_str()); + input_index_map_[tensor_name] = ip_index; + } + + return nullptr; // success +} + + +} // namespace triton::backend::pytorch diff --git a/src/model_instance_state.hh b/src/model_instance_state.hh new file mode 100644 index 0000000..b495510 --- /dev/null +++ b/src/model_instance_state.hh @@ -0,0 +1,178 @@ +// Copyright 2019-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions +// are met: +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// * Neither the name of NVIDIA CORPORATION nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +#pragma once + +#include + +#include +#include +#include +#include +#include + +#include "libtorch_utils.h" +#include "model_state.hh" +#include "naming_convention.hh" +#include "triton/backend/backend_common.h" +#include "triton/backend/backend_input_collector.h" +#include "triton/backend/backend_memory.h" +#include "triton/backend/backend_model.h" +#include "triton/backend/backend_model_instance.h" +#include "triton/backend/backend_output_responder.h" +#include "triton/common/nvtx.h" +#include "triton/core/tritonbackend.h" + + +namespace triton::backend::pytorch { + +// +// ModelInstanceState +// +// State associated with a model instance. An object of this class is +// created and associated with each TRITONBACKEND_ModelInstance. +// +class ModelInstanceState : public BackendModelInstance { + private: + ModelState* model_state_; + + // The full path to the TorchScript model file. + std::string model_path_; + + std::shared_ptr torch_model_; + torch::Device device_; + + // Map from configuration name for an input to the index of + // that input in the model. + std::unordered_map input_index_map_; + uint32_t batch_input_count_ = 0; + + // Map from configuration name for an output to the index of + // that output in the model. + std::unordered_map output_index_map_; + std::unordered_map output_dtype_map_; + + // If the input to the tensor is a dictionary of tensors. + bool is_dict_input_; + + // If the model supports batching. 
+ bool supports_batching_; + + cudaEvent_t compute_input_start_event_; + cudaEvent_t compute_infer_start_event_; + cudaEvent_t compute_output_start_event_; + + // Store the cuda streams created for the 'KIND_MODEL' instance group. + std::vector stream_vec_; + + // The number of available devices. + int device_cnt_; + + public: + virtual ~ModelInstanceState(); + + // Clear CUDA cache + void ClearCache(); + + static TRITONSERVER_Error* Create( + ModelState* model_state, + TRITONBACKEND_ModelInstance* triton_model_instance, + ModelInstanceState** state); + + // Execute... + void ProcessRequests( + TRITONBACKEND_Request** requests, const uint32_t request_count); + + // Get the state of the model that corresponds to this instance. + ModelState* StateForModel() const; + + private: + ModelInstanceState( + ModelState* model_state, + TRITONBACKEND_ModelInstance* triton_model_instance); + + void AddInputToMap( + NamingConvention naming_convention, + const std::vector allowed_inputs, const std::string& io_name, + const uint32_t index); + + // Create CUDA events for statistics collection. + void CreateCudaEvents(const int32_t& device_id); + + void Execute( + std::vector* responses, + const uint32_t response_count, + std::vector* input_tensors, + std::vector* output_tensors); + + // Get the elapsed time between two CUDA events. + float GetCudaEventElapsedTime( + const cudaEvent_t& start_event, const cudaEvent_t& end_event); + + // Get the appropriate CUDA stream for input and output handling based on + // the instance group type. + cudaStream_t GetCudaStreamByInstanceKind(); + + // Get the naming convention for inputs/outputs from the model configuration + TRITONSERVER_Error* GetNamingConvention( + NamingConvention* naming_convention, + const std::vector& allowed_io); + + TRITONSERVER_Error* ReadOutputTensors( + size_t total_batch_size, + const std::vector& output_tensors, + TRITONBACKEND_Request** requests, const uint32_t request_count, + std::vector* responses); + + TRITONSERVER_Error* RecordBackendTimestamp( + uint64_t* timestamp, void* cuda_event); + + // Replace the default CUDA stream with the stream we created to ensure + // proper cuda stream synchronization. + void SetCurrentCudaStream( + const cudaStream_t& stream, const int32_t& device_id); + + TRITONSERVER_Error* SetInputTensors( + size_t total_batch_size, TRITONBACKEND_Request** requests, + const uint32_t request_count, + std::vector* responses, + BackendInputCollector* collector, std::vector* input_names, + std::vector* input_tensors, bool* cuda_copy); + + TRITONSERVER_Error* ValidateBooleanSequenceControl( + triton::common::TritonJson::Value& sequence_batching, + const std::string& control_kind, bool required, bool* have_control); + + TRITONSERVER_Error* ValidateInputs(const size_t expected_input_cnt); + + TRITONSERVER_Error* ValidateOutputs(); + + TRITONSERVER_Error* ValidateTypedSequenceControl( + triton::common::TritonJson::Value& sequence_batching, + const std::string& control_kind, bool required, bool* have_control); +}; + +} // namespace triton::backend::pytorch diff --git a/src/model_state.cc b/src/model_state.cc new file mode 100644 index 0000000..b007438 --- /dev/null +++ b/src/model_state.cc @@ -0,0 +1,495 @@ +// Copyright 2019-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions +// are met: +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// * Neither the name of NVIDIA CORPORATION nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +#include "model_state.hh" + +#include + + +namespace { +std::once_flag pytorch_interop_threads_flag; +std::once_flag pytorch_intraop_threads_flag; +} // namespace + +namespace triton::backend::pytorch { + +ModelState::ModelState(TRITONBACKEND_Model* triton_model) + : BackendModel(triton_model), enable_optimized_execution_(true), + enable_inference_mode_(true), enable_cudnn_(true), + enable_cache_cleaning_(false), enable_weight_sharing_(false), + enable_tensor_fuser_pair_({false, true}), + enable_jit_profiling_pair_({false, true}), + enable_jit_executor_pair_({false, true}) +{ +} + +TRITONSERVER_Error* +ModelState::AutoCompleteConfig() +{ + // Auto-complete configuration is not supported since PyTorch does not + // store/capture sufficient model metadata so just log error instead. + LOG_MESSAGE( + TRITONSERVER_LOG_WARN, + (std::string("skipping model configuration auto-complete for '") + + Name() + "': not supported for pytorch backend") + .c_str()); + + return nullptr; // success +} + +TRITONSERVER_Error* +ModelState::Create(TRITONBACKEND_Model* triton_model, ModelState** state) +{ + try { + *state = new ModelState(triton_model); + } + catch (const BackendModelException& ex) { + RETURN_ERROR_IF_TRUE( + ex.err_ == nullptr, TRITONSERVER_ERROR_INTERNAL, + std::string("unexpected nullptr in BackendModelException")); + RETURN_IF_ERROR(ex.err_); + } + + // Auto-complete the configuration if requested... 
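+  // Note that for this backend AutoCompleteConfig() (defined above) only logs
+  // a warning, so when auto-complete is requested the configuration is simply
+  // written back unchanged via SetModelConfig().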
+ bool auto_complete_config = false; + RETURN_IF_ERROR(TRITONBACKEND_ModelAutoCompleteConfig( + triton_model, &auto_complete_config)); + if (auto_complete_config) { + RETURN_IF_ERROR((*state)->AutoCompleteConfig()); + RETURN_IF_ERROR((*state)->SetModelConfig()); + } + + auto& model_outputs = (*state)->model_outputs_; + // Parse the output states in the model configuration + triton::common::TritonJson::Value sequence_batching; + if ((*state)->ModelConfig().Find("sequence_batching", &sequence_batching)) { + triton::common::TritonJson::Value states; + if (sequence_batching.Find("state", &states)) { + for (size_t i = 0; i < states.ArraySize(); i++) { + triton::common::TritonJson::Value state; + RETURN_IF_ERROR(states.IndexAsObject(i, &state)); + std::string output_state_name; + RETURN_IF_ERROR( + state.MemberAsString("output_name", &output_state_name)); + auto it = model_outputs.find(output_state_name); + if (it == model_outputs.end()) { + model_outputs.insert({output_state_name, std::make_pair(-1, i)}); + } else { + it->second.second = i; + } + } + } + } + + // Parse the output names in the model configuration + triton::common::TritonJson::Value outputs; + RETURN_IF_ERROR((*state)->ModelConfig().MemberAsArray("output", &outputs)); + for (size_t i = 0; i < outputs.ArraySize(); i++) { + triton::common::TritonJson::Value output; + THROW_IF_BACKEND_INSTANCE_ERROR(outputs.IndexAsObject(i, &output)); + + // Use names from ModelConfig by reference since the model + // config will persist longer than this inference execution. + std::string output_name; + THROW_IF_BACKEND_INSTANCE_ERROR( + output.MemberAsString("name", &output_name)); + + auto it = model_outputs.find(output_name); + if (it == model_outputs.end()) { + model_outputs.insert({output_name, std::make_pair(i, -1)}); + } else { + it->second.first = i; + } + } + + RETURN_IF_ERROR((*state)->ParseParameters()); + + return nullptr; // success +} + +bool +ModelState::EnabledCacheCleaning() +{ + return enable_cache_cleaning_; +} + +bool +ModelState::EnabledCudnn() +{ + return enable_cudnn_; +} + +bool +ModelState::EnabledInferenceMode() +{ + return enable_inference_mode_; +} + +const std::pair& +ModelState::EnabledJitExecutor() const +{ + return enable_jit_executor_pair_; +} + +const std::pair& +ModelState::EnabledJitProfiling() const +{ + return enable_jit_profiling_pair_; +} + +bool +ModelState::EnabledOptimizedExecution() +{ + return enable_optimized_execution_; +} + +const std::pair& +ModelState::EnabledTensorExprFuser() const +{ + return enable_tensor_fuser_pair_; +} + +bool +ModelState::EnabledWeightSharing() +{ + return enable_weight_sharing_; +} + +TRITONSERVER_Error* +ModelState::LoadModel( + const std::string& artifact_name, const torch::Device device, + std::string* model_path, const TRITONSERVER_InstanceGroupKind& kind, + std::shared_ptr* torch_model) +{ + // Find the TorchScript file that describes the model. If the model + // configuration doesn't have an explicit model file specified then + // use the default name ("model.pt"). 
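+  // For example (illustrative layout): a model named "resnet50" at version 1
+  // in a repository mounted at /models resolves to /models/resnet50/1/model.pt.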
+ std::string cc_model_filename = artifact_name; + if (cc_model_filename.empty()) { + cc_model_filename = "model.pt"; + } + + *model_path = JoinPath( + {RepositoryPath(), std::to_string(Version()), cc_model_filename}); + + { + bool exists; + RETURN_IF_ERROR(FileExists(*model_path, &exists)); + RETURN_ERROR_IF_FALSE( + exists, TRITONSERVER_ERROR_UNAVAILABLE, + std::string("unable to find '") + *model_path + + "' for model instance '" + Name() + "'"); + } + + // If weight sharing is enabled, skip loading model if + // it is already available on the target device + std::pair device_pair; + if (enable_weight_sharing_) { + device_pair = std::make_pair(!device.is_cpu(), device.index()); + auto mit = torch_models_.find(device_pair); + if (mit != torch_models_.end()) { + *torch_model = mit->second; + LOG_MESSAGE( + TRITONSERVER_LOG_INFO, + (std::string("Reusing TorchScript model for instance '") + Name() + + "'") + .c_str()); + return nullptr; // success + } + } + + // Serialize the torch model to string + std::string model_data_str; + RETURN_IF_ERROR(ReadTextFile(*model_path, &model_data_str)); + + // InferenceMode should be used to guard all tensors operations including + // model loading: https://pytorch.org/cppdocs/notes/inference_mode.html + torch::InferenceMode infer_guard(EnabledInferenceMode()); + + try { + std::istringstream model_stream(model_data_str); + if (kind == TRITONSERVER_INSTANCEGROUPKIND_MODEL) { + // Load the model without selecting a device. + torch_model->reset( + new torch::jit::Module(torch::jit::load(model_stream))); + } else { + torch_model->reset( + new torch::jit::Module(torch::jit::load(model_stream, device))); + } + } + catch (const std::exception& ex) { + return TRITONSERVER_ErrorNew( + TRITONSERVER_ERROR_INTERNAL, + ("failed to load model '" + Name() + "': " + ex.what()).c_str()); + } + + if (enable_weight_sharing_) { + if (!((torch_models_.emplace(device_pair, *torch_model)).second)) { + std::string type = device.is_cpu() ? "CPU" : "GPU"; + LOG_MESSAGE( + TRITONSERVER_LOG_WARN, + (std::string("Model already found on target ") + type + " device " + + "(id " + std::to_string(device.index()) + ") for '" + Name() + "'") + .c_str()); + } + } + + return nullptr; // success +} + +const std::map>& +ModelState::ModelOutputs() +{ + return model_outputs_; +} + +TRITONSERVER_Error* +ModelState::ParseParameters() +{ + triton::common::TritonJson::Value params; + bool status = model_config_.Find("parameters", ¶ms); + if (status) { + // If 'DISABLE_OPTIMIZED_EXECUTION' is not present in 'parameters' then no + // update is made to 'enable_optimized_execution_'. + bool disable_optimized_execution = false; + TRITONSERVER_Error* err = ParseParameter( + params, "DISABLE_OPTIMIZED_EXECUTION", &disable_optimized_execution); + if (err != nullptr) { + if (TRITONSERVER_ErrorCode(err) != TRITONSERVER_ERROR_NOT_FOUND) { + return err; + } else { + TRITONSERVER_ErrorDelete(err); + } + } + enable_optimized_execution_ = !disable_optimized_execution; + + LOG_MESSAGE( + TRITONSERVER_LOG_INFO, + (std::string("Optimized execution is ") + + (enable_optimized_execution_ ? "enabled" : "disabled") + + " for model instance '" + Name() + "'") + .c_str()); + + // If 'ENABLE_CACHE_CLEANING' is not present in 'parameters' then + // no update is made to 'enable_cache_cleaning_'. 
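+  // The same pattern is applied to every optional parameter in this method: a
+  // TRITONSERVER_ERROR_NOT_FOUND returned by ParseParameter() is deleted and
+  // the default value is kept, while any other error is returned to the caller.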
+ err = ParseParameter( + params, "ENABLE_CACHE_CLEANING", &enable_cache_cleaning_); + if (err != nullptr) { + if (TRITONSERVER_ErrorCode(err) != TRITONSERVER_ERROR_NOT_FOUND) { + return err; + } else { + TRITONSERVER_ErrorDelete(err); + } + } + + LOG_MESSAGE( + TRITONSERVER_LOG_INFO, + (std::string("Cache Cleaning is ") + + (enable_cache_cleaning_ ? "enabled" : "disabled") + + " for model instance '" + Name() + "'") + .c_str()); + + // If 'INFERENCE_MODE' is not present in 'parameters' then no update is made + // to 'enable_inference_mode_'. + err = ParseParameter(params, "INFERENCE_MODE", &enable_inference_mode_); + if (err != nullptr) { + if (TRITONSERVER_ErrorCode(err) != TRITONSERVER_ERROR_NOT_FOUND) { + return err; + } else { + TRITONSERVER_ErrorDelete(err); + } + } + LOG_MESSAGE( + TRITONSERVER_LOG_INFO, + (std::string("Inference Mode is ") + + (enable_inference_mode_ ? "enabled" : "disabled") + + " for model instance '" + Name() + "'") + .c_str()); + + // If 'DISABLE_CUDNN' is not present in 'parameters' then no update is made + // to 'enable_cudnn_'. + bool disable_cudnn = false; + err = ParseParameter(params, "DISABLE_CUDNN", &disable_cudnn); + if (err != nullptr) { + if (TRITONSERVER_ErrorCode(err) != TRITONSERVER_ERROR_NOT_FOUND) { + return err; + } else { + TRITONSERVER_ErrorDelete(err); + } + } + enable_cudnn_ = !disable_cudnn; + LOG_MESSAGE( + TRITONSERVER_LOG_INFO, + (std::string("cuDNN is ") + (enable_cudnn_ ? "enabled" : "disabled") + + " for model instance '" + Name() + "'") + .c_str()); + + // If 'ENABLE_TENSOR_FUSER' is not present in 'parameters' then no + // update is made to 'enable_tensor_fuser'. + bool enable_tensor_fuser = false; + err = ParseParameter(params, "ENABLE_TENSOR_FUSER", &enable_tensor_fuser); + if (err != nullptr) { + if (TRITONSERVER_ErrorCode(err) != TRITONSERVER_ERROR_NOT_FOUND) { + return err; + } else { + TRITONSERVER_ErrorDelete(err); + } + } else { + enable_tensor_fuser_pair_ = {true, enable_tensor_fuser}; + LOG_MESSAGE( + TRITONSERVER_LOG_INFO, + (std::string("Tensor fuser is ") + + (enable_tensor_fuser ? "enabled" : "disabled") + + " for model instance '" + Name() + "'") + .c_str()); + } + + // If 'ENABLE_WEIGHT_SHARING' is not present in 'parameters' then no + // update is made to 'enable_weight_sharing'. + err = ParseParameter( + params, "ENABLE_WEIGHT_SHARING", &enable_weight_sharing_); + if (err != nullptr) { + if (TRITONSERVER_ErrorCode(err) != TRITONSERVER_ERROR_NOT_FOUND) { + return err; + } else { + TRITONSERVER_ErrorDelete(err); + } + } else { + LOG_MESSAGE( + TRITONSERVER_LOG_INFO, + (std::string("Weight sharing is ") + + (enable_weight_sharing_ ? "enabled" : "disabled") + + " for model instance '" + Name() + "'") + .c_str()); + } + + // If 'ENABLE_JIT_PROFILING' is not present in 'parameters' then no update + // is made to 'enable_jit_profiling'. + bool enable_jit_profiling = false; + err = ParseParameter(params, "ENABLE_JIT_PROFILING", &enable_jit_profiling); + if (err != nullptr) { + if (TRITONSERVER_ErrorCode(err) != TRITONSERVER_ERROR_NOT_FOUND) { + return err; + } else { + TRITONSERVER_ErrorDelete(err); + } + } else { + enable_jit_profiling_pair_ = {true, enable_jit_profiling}; + LOG_MESSAGE( + TRITONSERVER_LOG_INFO, + (std::string("Jit profiling is ") + + (enable_jit_profiling ? "enabled" : "disabled") + + " for model instance '" + Name() + "'") + .c_str()); + } + + // If 'ENABLE_JIT_EXECUTOR' is not present in 'parameters' then no update is + // made to 'enable_jit_executor'. 
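+  // As with the fuser and profiling flags above, the resulting pair records
+  // whether the parameter was explicitly set and, if so, the requested value.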
+ bool enable_jit_executor = false; + err = ParseParameter(params, "ENABLE_JIT_EXECUTOR", &enable_jit_executor); + if (err != nullptr) { + if (TRITONSERVER_ErrorCode(err) != TRITONSERVER_ERROR_NOT_FOUND) { + return err; + } else { + TRITONSERVER_ErrorDelete(err); + } + } else { + enable_jit_executor_pair_ = {true, enable_jit_executor}; + LOG_MESSAGE( + TRITONSERVER_LOG_INFO, + (std::string("Jit executor is ") + + (enable_jit_executor ? "enabled" : "disabled") + + " for model instance '" + Name() + "'") + .c_str()); + } + + // If 'INTRA_OP_THREAD_COUNT' is not present in 'parameters' then no update + // is made to 'intra_op_thread_count', which by default will take all + // threads + int intra_op_thread_count = -1; + err = + ParseParameter(params, "INTRA_OP_THREAD_COUNT", &intra_op_thread_count); + if (err != nullptr) { + if (TRITONSERVER_ErrorCode(err) != TRITONSERVER_ERROR_NOT_FOUND) { + return err; + } else { + TRITONSERVER_ErrorDelete(err); + } + } else { + if (intra_op_thread_count > 0) { + // at::set_num_threads() does not throw if called more than once, but + // issues warnings. std::call_once() is useful to limit these. + std::call_once(pytorch_intraop_threads_flag, [intra_op_thread_count]() { + at::set_num_threads(intra_op_thread_count); + }); + LOG_MESSAGE( + TRITONSERVER_LOG_INFO, + (std::string("Intra op thread count is set to ") + + std::to_string(at::get_num_threads()) + " for model instance '" + + Name() + "'") + .c_str()); + } + } + + // If 'INTER_OP_THREAD_COUNT' is not present in 'parameters' then no update + // is made to 'inter_op_thread_count', which by default will take all + // threads + int inter_op_thread_count = -1; + err = + ParseParameter(params, "INTER_OP_THREAD_COUNT", &inter_op_thread_count); + if (err != nullptr) { + if (TRITONSERVER_ErrorCode(err) != TRITONSERVER_ERROR_NOT_FOUND) { + return err; + } else { + TRITONSERVER_ErrorDelete(err); + } + } else { + if (inter_op_thread_count > 0) { + // at::set_num_interop_threads() throws if called more than once. + // std::call_once() should prevent this, but try/catch is additionally + // used for safety. + std::call_once(pytorch_interop_threads_flag, [inter_op_thread_count]() { + try { + at::set_num_interop_threads(inter_op_thread_count); + } + catch (const c10::Error& e) { + // do nothing + } + }); + LOG_MESSAGE( + TRITONSERVER_LOG_INFO, + (std::string("Inter op thread count is set to ") + + std::to_string(at::get_num_interop_threads()) + + " for model instance '" + Name() + "'") + .c_str()); + } + } + } + + return nullptr; +} + +} // namespace triton::backend::pytorch diff --git a/src/model_state.hh b/src/model_state.hh new file mode 100644 index 0000000..1a404b8 --- /dev/null +++ b/src/model_state.hh @@ -0,0 +1,131 @@ +// Copyright 2019-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions +// are met: +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// * Neither the name of NVIDIA CORPORATION nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. 
+// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +#pragma once + +#include + +#include +#include +#include + +#include "libtorch_utils.h" +#include "naming_convention.hh" +#include "triton/backend/backend_common.h" +#include "triton/backend/backend_input_collector.h" +#include "triton/backend/backend_memory.h" +#include "triton/backend/backend_model.h" +#include "triton/backend/backend_model_instance.h" +#include "triton/backend/backend_output_responder.h" +#include "triton/common/nvtx.h" +#include "triton/core/tritonbackend.h" + +// for thread control +// https://pytorch.org/docs/stable/notes/cpu_threading_torchscript_inference.html#runtime-api +// https://github.com/pytorch/pytorch/blob/v2.2.1-rc3/aten/src/ATen/Parallel.h#L133 +#include + + +namespace triton::backend::pytorch { + +class ModelState : public triton::backend::BackendModel { + private: + // Flag to indicate whether optimized execution is enabled. Defaults to true. + bool enable_optimized_execution_; + + // Flag to indicate whether inference mode is enabled. Defaults to false. + bool enable_inference_mode_; + + // Flag to indicate whether cudnn is enabled. Defaults to true. + bool enable_cudnn_; + + // Flag to indicate whether cache cleaning after each run is enabled. + // Defaults to false. + bool enable_cache_cleaning_; + + // Flag to indicate whether weight sharing is enabled. Defaults to false. + bool enable_weight_sharing_; + + // Flag pairs to indicate if various JIT settings are set and + // enabled respectively. Defaults to (false, true). Default behavior + // is to do nothing if not explicitly set. + std::pair enable_tensor_fuser_pair_; + std::pair enable_jit_profiling_pair_; + std::pair enable_jit_executor_pair_; + + // Model mapping for shared TorchScript model across all instances on the + // same device. The key is a pair of isGPU and device index. + std::map< + std::pair, std::shared_ptr> + torch_models_; + + // model_outputs is a map that contains unique outputs that the model must + // provide. The first pair is the model output index and the second is + // the index in the model state, -1 is used if one is not required. + // In the model configuration, the output in the state configuration + // can have intersection with the outputs section of the model. If an output + // is specified both in the output section and state section, it indicates + // that the backend must return the output state to the client too. 
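+  // For example (illustrative names): an output "OUTPUT__0" that appears at
+  // index 2 of the 'output' section and at index 0 of the state section maps
+  // to {2, 0}; an output that is not also a state maps to {2, -1}.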
+ std::map> model_outputs_; + + public: + virtual ~ModelState() = default; + + static TRITONSERVER_Error* Create( + TRITONBACKEND_Model* triton_model, ModelState** state); + + bool EnabledCacheCleaning(); + + bool EnabledCudnn(); + + bool EnabledInferenceMode(); + + const std::pair& EnabledJitExecutor() const; + + const std::pair& EnabledJitProfiling() const; + + bool EnabledOptimizedExecution(); + + const std::pair& EnabledTensorExprFuser() const; + + bool EnabledWeightSharing(); + + TRITONSERVER_Error* LoadModel( + const std::string& artifact_name, const torch::Device device, + std::string* model_path, const TRITONSERVER_InstanceGroupKind& kind, + std::shared_ptr* torch_model); + + const std::map>& ModelOutputs(); + + private: + ModelState(TRITONBACKEND_Model* triton_model); + + TRITONSERVER_Error* AutoCompleteConfig(); + + TRITONSERVER_Error* ParseParameters(); +}; + +} // namespace triton::backend::pytorch diff --git a/src/naming_convention.hh b/src/naming_convention.hh new file mode 100644 index 0000000..756cba4 --- /dev/null +++ b/src/naming_convention.hh @@ -0,0 +1,40 @@ +// Copyright 2019-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions +// are met: +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// * Neither the name of NVIDIA CORPORATION nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +#pragma once + + +namespace triton::backend::pytorch { + +// The naming convention followed for inputs/outputs in the model configuration. +// Outputs don't support FORWARD_ARGUMENT. +enum class NamingConvention { + NAMED_INDEX, + FORWARD_ARGUMENT, + STRICT_CONFIG_ORDERING +}; + +} // namespace triton::backend::pytorch diff --git a/src/string_utils.cc b/src/string_utils.cc new file mode 100644 index 0000000..a605c7c --- /dev/null +++ b/src/string_utils.cc @@ -0,0 +1,254 @@ +// Copyright 2019-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions +// are met: +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. 
+// * Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// * Neither the name of NVIDIA CORPORATION nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +#include "string_utils.hh" + + +namespace triton::backend::pytorch { + +// This function will return a tensor's contents as a contiguous +// chunk in system memory. In some cases this will require copying the data. +// If that happens, 'contiguous_buffer' will be set to hold the contiguous +// chunk and 'cuda_copy' will be set to indicate whether CUDA copy is +// conducted. The data copy can be avoided if the input is already in +// a contiguous chunk and the input is located in memory type and id +// specified. +TRITONSERVER_Error* +GetContiguousInputContent( + TRITONBACKEND_Input* rinput, const uint32_t buffer_count, + const char** content, size_t* content_byte_size, + std::vector* contiguous_buffer, cudaStream_t stream, bool* cuda_copy) +{ + *cuda_copy = false; + + // Check input buffers to see if data copy is necessary + size_t chunk_count = 0; + bool type_mismatch = false; + uint64_t total_byte_size = 0; + for (size_t idx = 0; idx < buffer_count; ++idx) { + TRITONSERVER_MemoryType src_memory_type; + int64_t src_memory_type_id; + size_t src_byte_size; + const void* src_ptr; + + RETURN_IF_ERROR(TRITONBACKEND_InputBuffer( + rinput, idx, &src_ptr, &src_byte_size, &src_memory_type, + &src_memory_type_id)); + + if (src_ptr != nullptr) { + chunk_count++; + total_byte_size += src_byte_size; + type_mismatch |= (src_memory_type == TRITONSERVER_MEMORY_GPU); + } + } + + if (chunk_count == 0) { + *content = nullptr; + *content_byte_size = 0; + } else if ((chunk_count == 1) && !type_mismatch) { + TRITONSERVER_MemoryType src_memory_type; + int64_t src_memory_type_id; + RETURN_IF_ERROR(TRITONBACKEND_InputBuffer( + rinput, 0, (const void**)content, content_byte_size, &src_memory_type, + &src_memory_type_id)); + } else { + contiguous_buffer->resize(total_byte_size); + + size_t offset = 0; + for (size_t i = 0; i < chunk_count; i++) { + bool cuda_used; + TRITONSERVER_MemoryType src_memory_type; + int64_t src_memory_type_id; + size_t src_byte_size; + const void* src_ptr; + + RETURN_IF_ERROR(TRITONBACKEND_InputBuffer( + rinput, i, &src_ptr, &src_byte_size, &src_memory_type, + &src_memory_type_id)); + RETURN_IF_ERROR(CopyBuffer( + "Contiguous input", src_memory_type, src_memory_type_id, + TRITONSERVER_MEMORY_CPU, 0, src_byte_size, src_ptr, + contiguous_buffer->data() + offset, stream, &cuda_used)); + *cuda_copy |= 
cuda_used; + offset += src_byte_size; + } + + *content = contiguous_buffer->data(); + *content_byte_size = total_byte_size; + } + + return nullptr; // success +} + +void +FillStringTensor(torch::List* input_list, const size_t cnt) +{ + for (size_t c = 0; c < cnt; ++c) { + input_list->push_back(""); + } +} + +bool +SetStringBuffer( + torch::List* tensor, TRITONBACKEND_Response** response, + TRITONBACKEND_Output* response_output, TRITONBACKEND_State* response_state, + const size_t tensor_element_count, cudaStream_t stream, + std::string* serialized, bool state) +{ + bool cuda_copy = false; + + // Serialize the output tensor strings. Each string is serialized as + // a 4-byte length followed by the string itself with no + // null-terminator. + serialized->clear(); + for (size_t e = 0; e < tensor_element_count; ++e) { + std::string str = tensor->get(e).to(); + const char* cstr = str.c_str(); + size_t len = str.length(); + serialized->append(reinterpret_cast(&len), sizeof(uint32_t)); + if (len > 0) { + serialized->append(cstr, len); + } + } + + // Allocate a buffer large enough to hold the serialized tensor. + TRITONSERVER_MemoryType actual_memory_type = TRITONSERVER_MEMORY_CPU; + int64_t actual_memory_type_id = 0; + + TRITONSERVER_Error* err; + void* buffer; + + if (!state) { + auto err = TRITONBACKEND_OutputBuffer( + response_output, &buffer, serialized->size(), &actual_memory_type, + &actual_memory_type_id); + if (err != nullptr) { + RESPOND_AND_SET_NULL_IF_ERROR(response, err); + return cuda_copy; + } + } else { + auto err = TRITONBACKEND_StateBuffer( + response_state, &buffer, serialized->size(), &actual_memory_type, + &actual_memory_type_id); + if (err != nullptr) { + RESPOND_AND_SET_NULL_IF_ERROR(response, err); + return cuda_copy; + } + } + // Copy the serialized tensor into the allocated buffer. + bool cuda_used = false; + err = CopyBuffer( + "String output", TRITONSERVER_MEMORY_CPU /* src_memory_type */, + 0 /* src_memory_type_id */, actual_memory_type, actual_memory_type_id, + serialized->size(), reinterpret_cast(serialized->c_str()), + buffer, stream, &cuda_used); + cuda_copy |= cuda_used; + + if (err != nullptr) { + RESPOND_AND_SET_NULL_IF_ERROR(response, err); + return cuda_copy; + } + + if (state) { + RESPOND_AND_SET_NULL_IF_ERROR( + response, TRITONBACKEND_StateUpdate(response_state)); + } + + return cuda_copy; +} + +bool +SetStringInputTensor( + torch::List* input_list, TRITONBACKEND_Input* input, + const char* name, const uint32_t buffer_count, + const size_t request_element_cnt, TRITONBACKEND_Response** response, + cudaStream_t stream, const char* host_policy_name) +{ + bool cuda_copy = false; + + // For string data type, we always need to have the data on CPU so + // that we can read string length and construct the string + // properly. So if the request's input tensor is not in CPU need to + // copy it there. + const char* content = nullptr; + size_t content_byte_size = 0; + + std::vector contiguous_buffer; + auto err = GetContiguousInputContent( + input, buffer_count, &content, &content_byte_size, &contiguous_buffer, + stream, &cuda_copy); + if (err != nullptr) { + RESPOND_AND_SET_NULL_IF_ERROR(response, err); + FillStringTensor(input_list, request_element_cnt); + return cuda_copy; + } + +#ifdef TRITON_ENABLE_GPU + if (cuda_copy) { + cudaStreamSynchronize(stream); + cuda_copy = false; + } +#endif // TRITON_ENABLE_GPU + + std::vector> str_list; + err = ValidateStringBuffer( + content, content_byte_size, request_element_cnt, name, &str_list); + // Set string values. 
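+  // Each entry of 'str_list' is a (pointer, length) view into 'content', which
+  // holds elements in the serialized form described above. For example
+  // (illustrative), the element "abc" occupies 7 bytes: a 4-byte length in
+  // native byte order (03 00 00 00 on little-endian hosts) followed by the raw
+  // characters with no null terminator.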
+ for (const auto& [addr, len] : str_list) { + input_list->push_back(std::string(addr, len)); + } + + size_t element_cnt = str_list.size(); + if (err != nullptr) { + RESPOND_AND_SET_NULL_IF_ERROR(response, err); + FillStringTensor(input_list, request_element_cnt - element_cnt); + } + return cuda_copy; +} + +bool +SetStringOutputBuffer( + torch::List* tensor, TRITONBACKEND_Response** response, + TRITONBACKEND_Output* response_output, const size_t tensor_element_count, + cudaStream_t stream, std::string* serialized) +{ + return SetStringBuffer( + tensor, response, response_output, nullptr /* response_state */, + tensor_element_count, stream, serialized, false /* state */); +} + +bool +SetStringStateBuffer( + torch::List* tensor, TRITONBACKEND_Response** response, + TRITONBACKEND_State* response_state, const size_t tensor_element_count, + cudaStream_t stream, std::string* serialized) +{ + return SetStringBuffer( + tensor, response, nullptr /* response_output */, response_state, + tensor_element_count, stream, serialized, true /* state */); +} + +} // namespace triton::backend::pytorch diff --git a/src/string_utils.hh b/src/string_utils.hh new file mode 100644 index 0000000..8373478 --- /dev/null +++ b/src/string_utils.hh @@ -0,0 +1,106 @@ +// Copyright 2019-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions +// are met: +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// * Neither the name of NVIDIA CORPORATION nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +#pragma once + +#include + +#include +#include +#include + +#include "libtorch_utils.h" +#include "triton/backend/backend_common.h" +#include "triton/backend/backend_input_collector.h" +#include "triton/backend/backend_memory.h" +#include "triton/backend/backend_model.h" +#include "triton/backend/backend_model_instance.h" +#include "triton/backend/backend_output_responder.h" +#include "triton/common/nvtx.h" +#include "triton/core/tritonbackend.h" + +#ifdef TRITON_PYTORCH_ENABLE_TORCHVISION +// Suppress warnings in torch headers +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wsign-compare" +#pragma warning(push, 0) +#include +#include // Torchvision header +#pragma warning(pop) +#pragma GCC diagnostic pop +#endif // TRITON_PYTORCH_ENABLE_TORCHVISION + +#ifdef TRITON_ENABLE_GPU +#include +#include +#include +#endif // TRITON_ENABLE_GPU + +// for thread control +// https://pytorch.org/docs/stable/notes/cpu_threading_torchscript_inference.html#runtime-api +// https://github.com/pytorch/pytorch/blob/v2.2.1-rc3/aten/src/ATen/Parallel.h#L133 +#include + + +namespace triton::backend::pytorch { + +void FillStringTensor(torch::List* input_list, const size_t cnt); + +// This function will return a tensor's contents as a contiguous +// chunk in system memory. In some cases this will require copying the data. +// If that happens, 'contiguous_buffer' will be set to hold the contiguous +// chunk and 'cuda_copy' will be set to indicate whether CUDA copy is +// conducted. The data copy can be avoided if the input is already in +// a contiguous chunk and the input is located in memory type and id +// specified. +TRITONSERVER_Error* GetContiguousInputContent( + TRITONBACKEND_Input* rinput, const uint32_t buffer_count, + const char** content, size_t* content_byte_size, + std::vector* contiguous_buffer, cudaStream_t stream, bool* cuda_copy); + +bool SetStringBuffer( + torch::List* tensor, TRITONBACKEND_Response** response, + TRITONBACKEND_Output* response_output, TRITONBACKEND_State* response_state, + const size_t tensor_element_count, cudaStream_t stream, + std::string* serialized, bool state); + +bool SetStringInputTensor( + torch::List* input_list, TRITONBACKEND_Input* input, + const char* name, const uint32_t buffer_count, + const size_t request_element_cnt, TRITONBACKEND_Response** response, + cudaStream_t stream, const char* host_policy_name); + +bool SetStringOutputBuffer( + torch::List* tensor, TRITONBACKEND_Response** response, + TRITONBACKEND_Output* response_output, const size_t tensor_element_count, + cudaStream_t stream, std::string* serialized); + +bool SetStringStateBuffer( + torch::List* tensor, TRITONBACKEND_Response** response, + TRITONBACKEND_State* response_state, const size_t tensor_element_count, + cudaStream_t stream, std::string* serialized); + +} // namespace triton::backend::pytorch From 7e9d0f9d29f5c87f36c5c5fd22d8bdd5e242c1b2 Mon Sep 17 00:00:00 2001 From: Misha Chornyi <99709299+mc-nv@users.noreply.github.com> Date: Wed, 5 Nov 2025 10:57:33 -0800 Subject: [PATCH 76/76] build(fix): Update header file reference (#170) * Update header file reference * fix: address pre-commit issue * Update header name --- src/libtorch.hh | 2 +- src/model_instance_state.cc | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/src/libtorch.hh b/src/libtorch.hh index 263c340..4bd4700 100644 --- a/src/libtorch.hh +++ b/src/libtorch.hh @@ -27,7 +27,7 @@ #include "model_instance_state.hh" #include "model_state.hh" #include "naming_convention.hh" -#include 
"string_utilities.hh" +#include "string_utils.hh" // // PyTorch C++ (LibTorch) Backend that implements the TRITONBACKEND API. diff --git a/src/model_instance_state.cc b/src/model_instance_state.cc index 7cd5ee3..19cae27 100644 --- a/src/model_instance_state.cc +++ b/src/model_instance_state.cc @@ -25,7 +25,8 @@ // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "model_instance_state.hh" -#include "string_utilities.hh" + +#include "string_utils.hh" #ifdef TRITON_PYTORCH_ENABLE_TORCHVISION // Suppress warnings in torch headers