diff --git a/.clang-format b/.clang-format index 98c6497..1defc17 100644 --- a/.clang-format +++ b/.clang-format @@ -2,6 +2,7 @@ BasedOnStyle: Google IndentWidth: 2 +ColumnLimit: 80 ContinuationIndentWidth: 4 UseTab: Never MaxEmptyLinesToKeep: 2 @@ -34,4 +35,5 @@ BinPackArguments: true BinPackParameters: true ConstructorInitializerAllOnOneLineOrOnePerLine: false -IndentCaseLabels: true \ No newline at end of file +IndentCaseLabels: true + diff --git a/.github/workflows/pre-commit.yml b/.github/workflows/pre-commit.yml new file mode 100644 index 0000000..4fa1873 --- /dev/null +++ b/.github/workflows/pre-commit.yml @@ -0,0 +1,38 @@ +# Copyright 2023-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +name: pre-commit + +on: + pull_request: + +jobs: + pre-commit: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v5.0.0 + - uses: actions/setup-python@v6.0.0 + - uses: pre-commit/action@v3.0.1 diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml new file mode 100644 index 0000000..3c76a6e --- /dev/null +++ b/.pre-commit-config.yaml @@ -0,0 +1,73 @@ +# Copyright 2023-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. 
+# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +repos: +- repo: https://github.com/PyCQA/isort + rev: 5.12.0 + hooks: + - id: isort + additional_dependencies: [toml] +- repo: https://github.com/psf/black + rev: 23.1.0 + hooks: + - id: black + types_or: [python, cython] +- repo: https://github.com/PyCQA/flake8 + rev: 7.3.0 + hooks: + - id: flake8 + args: [--max-line-length=88, --select=C,E,F,W,B,B950, --extend-ignore = E203,E501] + types_or: [python, cython] +- repo: https://github.com/pre-commit/mirrors-clang-format + rev: v16.0.5 + hooks: + - id: clang-format + types_or: [c, c++, cuda, proto, textproto, java] + args: ["-fallback-style=none", "-style=file", "-i"] +- repo: https://github.com/codespell-project/codespell + rev: v2.2.4 + hooks: + - id: codespell + additional_dependencies: [tomli] + args: ["--toml", "pyproject.toml"] + exclude: (?x)^(.*stemmer.*|.*stop_words.*|^CHANGELOG.md$) +# More details about these pre-commit hooks here: +# https://pre-commit.com/hooks.html +- repo: https://github.com/pre-commit/pre-commit-hooks + rev: v6.0.0 + hooks: + - id: check-case-conflict + - id: check-executables-have-shebangs + - id: check-merge-conflict + - id: check-json + - id: check-toml + - id: check-yaml + - id: check-shebang-scripts-are-executable + - id: end-of-file-fixer + types_or: [c, c++, cuda, proto, textproto, java, python] + - id: mixed-line-ending + - id: requirements-txt-fixer + - id: trailing-whitespace diff --git a/CMakeLists.txt b/CMakeLists.txt index 076b095..5b0e399 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,4 +1,4 @@ -# Copyright 2019-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# Copyright 2019-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions @@ -24,10 +24,13 @@ # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -cmake_minimum_required (VERSION 3.18) +cmake_minimum_required (VERSION 3.31.8) project(tritonpytorchbackend LANGUAGES C CXX) +# Use C++17 standard as Triton's minimum required. +set(TRITON_MIN_CXX_STANDARD 17 CACHE STRING "The minimum C++ standard which features are requested to build this target.") + # # Options # @@ -44,13 +47,16 @@ project(tritonpytorchbackend LANGUAGES C CXX) option(TRITON_ENABLE_GPU "Enable GPU support in backend" ON) option(TRITON_ENABLE_STATS "Include statistics collections in backend" ON) +option(TRITON_ENABLE_NVTX "Include nvtx markers collection in backend." 
OFF) option(TRITON_PYTORCH_ENABLE_TORCHTRT "Enable TorchTRT support" OFF) option(TRITON_PYTORCH_ENABLE_TORCHVISION "Enable Torchvision support" ON) +option(TRITON_PYTORCH_NVSHMEM "Enable NVSHMEM support" ON) set(TRITON_PYTORCH_DOCKER_IMAGE "" CACHE STRING "Docker image containing the PyTorch build required by backend.") set(TRITON_PYTORCH_INCLUDE_PATHS "" CACHE PATH "Paths to Torch includes") set(TRITON_PYTORCH_LIB_PATHS "" CACHE PATH "Paths to Torch libraries") +set(TRITON_REPO_ORGANIZATION "https://github.com/triton-inference-server" CACHE STRING "Git repository to pull from") set(TRITON_BACKEND_REPO_TAG "main" CACHE STRING "Tag for triton-inference-server/backend repo") set(TRITON_CORE_REPO_TAG "main" CACHE STRING "Tag for triton-inference-server/core repo") set(TRITON_COMMON_REPO_TAG "main" CACHE STRING "Tag for triton-inference-server/common repo") @@ -73,37 +79,54 @@ else() endif() # Look for installed Torchvision package in lib paths - if(TRITON_PYTORCH_ENABLE_TORCHVISION AND NOT EXISTS "${TRITON_PYTORCH_LIB_PATHS}/libtorchvision.so") + find_library( LIBTORCHVISION libtorchvision.so libtorchvision.so.1 PATHS ${TRITON_PYTORCH_LIB_PATHS} ) + if(NOT ${LIBTORCHVISION}) message(WARNING "TRITON_PYTORCH_ENABLE_TORCHVISION is on, but TRITON_PYTORCH_LIB_PATHS does not contain Torchvision package") - endif() + endif(NOT ${LIBTORCHVISION}) endif() # Python.h needed by torch headers. -find_package(Python3 REQUIRED COMPONENTS Development) +find_package(Python3 REQUIRED COMPONENTS Development.Module) + +set(RHEL_BUILD OFF) +set(LIB_DIR "lib") +set(LIBTORCH_LIBS_PATH "/usr/local/lib") +set(PY_INSTALL_PATH "/usr/local/lib/python3.12/dist-packages") +if(LINUX) + file(STRINGS "/etc/os-release" DISTRO_ID_LIKE REGEX "ID_LIKE") + if(${DISTRO_ID_LIKE} MATCHES "rhel|centos") + set(RHEL_BUILD ON) + set(LIB_DIR "lib64") + set(PY_INSTALL_PATH "/opt/_internal/cpython-3.12.1/lib/python3.12/site-packages") + if(${CMAKE_SYSTEM_PROCESSOR} MATCHES "x86_64") + set(LIBTORCH_LIBS_PATH "/opt/_internal/cpython-3.12.1/lib") + endif(${CMAKE_SYSTEM_PROCESSOR} MATCHES "x86_64") + endif(${DISTRO_ID_LIKE} MATCHES "rhel|centos") +endif(LINUX) # # Dependencies # -# FetchContent's composibility isn't very good. We must include the +# FetchContent's composability isn't very good. We must include the # transitive closure of all repos so that we can override the tag. 
# include(FetchContent) FetchContent_Declare( repo-common - GIT_REPOSITORY https://github.com/triton-inference-server/common.git + GIT_REPOSITORY ${TRITON_REPO_ORGANIZATION}/common.git GIT_TAG ${TRITON_COMMON_REPO_TAG} GIT_SHALLOW ON ) FetchContent_Declare( repo-core - GIT_REPOSITORY https://github.com/triton-inference-server/core.git + GIT_REPOSITORY ${TRITON_REPO_ORGANIZATION}/core.git GIT_TAG ${TRITON_CORE_REPO_TAG} GIT_SHALLOW ON ) FetchContent_Declare( repo-backend - GIT_REPOSITORY https://github.com/triton-inference-server/backend.git + GIT_REPOSITORY ${TRITON_REPO_ORGANIZATION}/backend.git GIT_TAG ${TRITON_BACKEND_REPO_TAG} GIT_SHALLOW ON ) @@ -120,66 +143,93 @@ else() endif() endif() # TRITON_ENABLE_GPU +if(${TRITON_ENABLE_NVTX}) + add_definitions(-DTRITON_ENABLE_NVTX=1) +endif() # TRITON_ENABLE_NVTX + # # Shared library implementing the Triton Backend API # configure_file(src/libtriton_pytorch.ldscript libtriton_pytorch.ldscript COPYONLY) -if (${TRITON_PYTORCH_DOCKER_BUILD}) - if (CMAKE_HOST_SYSTEM_PROCESSOR MATCHES "aarch64") - set(LIBS_ARCH "aarch64") - set(CONDA_LIBS - "libopenblas.so.0" - ) - else() - set(LIBS_ARCH "x86_64") - set(CONDA_LIBS - "libmkl_core.so" - "libmkl_gnu_thread.so" - "libmkl_intel_lp64.so" - "libmkl_intel_thread.so" - "libmkl_def.so" - "libmkl_vml_def.so" - "libmkl_rt.so" - "libmkl_avx2.so" - "libmkl_avx512.so" - "libmkl_sequential.so" - "libomp.so" - ) - endif() +set(PT_LIBS + "libc10.so" + "libc10_cuda.so" + "libtorch.so" + "libtorch_cpu.so" + "libtorch_cuda.so" + "libtorch_cuda_linalg.so" + "libtorch_global_deps.so" + "libjpeg.so.62" +) + +if (${TRITON_PYTORCH_NVSHMEM}) set(PT_LIBS - ${CONDA_LIBS} - "libc10.so" - "libc10_cuda.so" - "libtorch.so" - "libtorch_cpu.so" - "libtorch_cuda.so" - "libtorch_global_deps.so" - "libtorchvision.so" + ${PT_LIBS} + "libtorch_nvshmem.so" ) - set(OPENCV_LIBS - "libopencv_video.so" - "libopencv_videoio.so" - "libopencv_highgui.so" - "libopencv_imgcodecs.so" - "libopencv_imgproc.so" - "libopencv_core.so" - "libpng16.so" +endif() # TRITON_PYTORCH_NVSHMEM + +if (${TRITON_PYTORCH_ENABLE_TORCHVISION}) + set(PT_LIBS + ${PT_LIBS} + $,libtorchvision.so,libtorchvision.so.1> ) +endif() # TRITON_PYTORCH_ENABLE_TORCHVISION - string(REPLACE ";" " " CONDA_LIBS_STR "${CONDA_LIBS}") +if (${TRITON_PYTORCH_ENABLE_TORCHTRT}) + set(PT_LIBS + ${PT_LIBS} + "libtorchtrt_runtime.so" + ) +endif() # TRITON_PYTORCH_ENABLE_TORCHTRT + +if (CMAKE_HOST_SYSTEM_PROCESSOR MATCHES "aarch64") + set(LIBS_ARCH "aarch64") + set(LIBTORCH_LIBS + "libnvpl_blas_core.so.0" + "libnvpl_blas_ilp64_gomp.so.0" + "libnvpl_blas_ilp64_seq.so.0" + "libnvpl_blas_lp64_gomp.so.0" + "libnvpl_blas_lp64_seq.so.0" + "libnvpl_lapack_core.so.0" + "libnvpl_lapack_ilp64_gomp.so.0" + "libnvpl_lapack_ilp64_seq.so.0" + "libnvpl_lapack_lp64_gomp.so.0" + "libnvpl_lapack_lp64_seq.so.0" + ) +else() + set(LIBS_ARCH "x86_64") + set(LIBTORCH_LIBS + "libmkl_avx2.so.1" + "libmkl_avx512.so.1" + "libmkl_core.so.1" + "libmkl_def.so.1" + "libmkl_gnu_thread.so.1" + "libmkl_intel_lp64.so.1" + "libmkl_intel_thread.so.1" + "libmkl_rt.so.1" + "libmkl_sequential.so.1" + "libmkl_vml_def.so.1" + ) +endif() +set(TORCHVISION_LIBS + $,libjpeg.so.62,libjpeg.so> + $,libpng16.so.16,libpng16.so> +) - if (${TRITON_PYTORCH_ENABLE_TORCHTRT}) - set(PT_LIBS - ${PT_LIBS} - "libtorchtrt_runtime.so" - ) - endif() # TRITON_PYTORCH_ENABLE_TORCHTRT +# The patchelf commands ensure the MKL libraries are loaded correctly during runtime +# Without these, the framework/backend complains of missing libraries / symbols and +# in some 
cases leads to segmentation faults. +if (${TRITON_PYTORCH_DOCKER_BUILD}) + string(REPLACE ";" " " LIBTORCH_LIBS_STR "${LIBTORCH_LIBS}") + string(RANDOM 8 "abcdefghijklmnopqrstuvwxyz" random_id) add_custom_command( OUTPUT ${PT_LIBS} - ${OPENCV_LIBS} + ${LIBTORCH_LIBS} + ${TORCHVISION_LIBS} LICENSE.pytorch include/torch include/torchvision @@ -187,38 +237,42 @@ if (${TRITON_PYTORCH_DOCKER_BUILD}) COMMAND docker pull ${TRITON_PYTORCH_DOCKER_IMAGE} COMMAND docker rm pytorch_backend_ptlib || echo "error ignored..." || true COMMAND docker create --name pytorch_backend_ptlib ${TRITON_PYTORCH_DOCKER_IMAGE} - COMMAND /bin/sh -c "for i in ${CONDA_LIBS_STR} ; do echo copying $i && docker cp -L pytorch_backend_ptlib:/opt/conda/lib/$i $i ; done" - COMMAND docker cp pytorch_backend_ptlib:/opt/conda/lib/python3.8/site-packages/torch/lib/libc10.so libc10.so - COMMAND docker cp pytorch_backend_ptlib:/opt/conda/lib/python3.8/site-packages/torch/lib/libc10_cuda.so libc10_cuda.so - COMMAND docker cp pytorch_backend_ptlib:/opt/conda/lib/python3.8/site-packages/torch/lib/libtorch.so libtorch.so - COMMAND docker cp pytorch_backend_ptlib:/opt/conda/lib/python3.8/site-packages/torch/lib/libtorch_cpu.so libtorch_cpu.so - COMMAND docker cp pytorch_backend_ptlib:/opt/conda/lib/python3.8/site-packages/torch/lib/libtorch_cuda.so libtorch_cuda.so - COMMAND docker cp pytorch_backend_ptlib:/opt/conda/lib/python3.8/site-packages/torch/lib/libtorch_global_deps.so libtorch_global_deps.so - COMMAND docker cp pytorch_backend_ptlib:/opt/pytorch/vision/build/libtorchvision.so libtorchvision.so - COMMAND /bin/sh -c "if [ ${TRITON_PYTORCH_ENABLE_TORCHTRT} = 'ON' ]; then docker cp pytorch_backend_ptlib:/opt/conda/lib/python3.8/site-packages/torch_tensorrt/lib/libtorchtrt_runtime.so libtorchtrt_runtime.so; fi" - COMMAND docker cp pytorch_backend_ptlib:/opt/conda/lib/python3.8/site-packages/torch_tensorrt/bin/torchtrtc torchtrtc || echo "error ignored..." 
|| true + COMMAND /bin/sh -c "for i in ${LIBTORCH_LIBS_STR} ; do echo copying $i && docker cp -L pytorch_backend_ptlib:${LIBTORCH_LIBS_PATH}/$i $i ; done" + COMMAND docker cp pytorch_backend_ptlib:${PY_INSTALL_PATH}/torch/lib/libc10.so libc10.so + COMMAND docker cp pytorch_backend_ptlib:${PY_INSTALL_PATH}/torch/lib/libc10_cuda.so libc10_cuda.so + COMMAND docker cp pytorch_backend_ptlib:${PY_INSTALL_PATH}/torch/lib/libtorch.so libtorch.so + COMMAND docker cp pytorch_backend_ptlib:${PY_INSTALL_PATH}/torch/lib/libtorch_cpu.so libtorch_cpu.so + COMMAND docker cp pytorch_backend_ptlib:${PY_INSTALL_PATH}/torch/lib/libtorch_cuda.so libtorch_cuda.so + COMMAND docker cp pytorch_backend_ptlib:${PY_INSTALL_PATH}/torch/lib/libtorch_cuda_linalg.so libtorch_cuda_linalg.so + COMMAND docker cp pytorch_backend_ptlib:${PY_INSTALL_PATH}/torch/lib/libtorch_global_deps.so libtorch_global_deps.so + COMMAND docker cp pytorch_backend_ptlib:${PY_INSTALL_PATH}/torch/lib/libcaffe2_nvrtc.so libcaffe2_nvrtc.so + COMMAND /bin/sh -c "if [ ${TRITON_PYTORCH_NVSHMEM} = 'ON' ]; then docker cp pytorch_backend_ptlib:${PY_INSTALL_PATH}/torch/lib/libtorch_nvshmem.so libtorch_nvshmem.so; fi" + COMMAND /bin/sh -c "if [ ${TRITON_PYTORCH_ENABLE_TORCHVISION} = 'ON' ]; then if [ ${RHEL_BUILD} = 'ON' ]; then docker cp -a -L pytorch_backend_ptlib:/usr/local/lib64/libtorchvision.so libtorchvision.so; else docker cp -a -L pytorch_backend_ptlib:/usr/local/${LIB_DIR}/libtorchvision.so.1 libtorchvision.so.1; fi; fi" + COMMAND /bin/sh -c "if [ ${TRITON_PYTORCH_ENABLE_TORCHVISION} = 'ON' ]; then docker cp pytorch_backend_ptlib:/opt/pytorch/vision/torchvision/csrc include/torchvision/torchvision; fi" + COMMAND /bin/sh -c "if [ ${TRITON_PYTORCH_ENABLE_TORCHTRT} = 'ON' ]; then docker cp pytorch_backend_ptlib:/usr/local/lib/python3.12/dist-packages/torch_tensorrt/lib/libtorchtrt_runtime.so libtorchtrt_runtime.so; fi" + COMMAND docker cp pytorch_backend_ptlib:${PY_INSTALL_PATH}/torch_tensorrt/bin/torchtrtc torchtrtc || echo "error ignored..." 
|| true COMMAND docker cp pytorch_backend_ptlib:/opt/pytorch/pytorch/LICENSE LICENSE.pytorch - COMMAND docker cp pytorch_backend_ptlib:/opt/conda/lib/python3.8/site-packages/torch/include include/torch - COMMAND docker cp pytorch_backend_ptlib:/opt/pytorch/pytorch/torch/csrc/jit/codegen include/torch/torch/csrc/jit/codegen - COMMAND docker cp pytorch_backend_ptlib:/opt/pytorch/vision/torchvision/csrc include/torchvision/torchvision - COMMAND docker cp pytorch_backend_ptlib:/usr/lib/${LIBS_ARCH}-linux-gnu/libopencv_videoio.so.3.4.11 libopencv_videoio.so - COMMAND docker cp pytorch_backend_ptlib:/usr/lib/${LIBS_ARCH}-linux-gnu/libopencv_highgui.so.3.4.11 libopencv_highgui.so - COMMAND docker cp pytorch_backend_ptlib:/usr/lib/${LIBS_ARCH}-linux-gnu/libopencv_video.so.3.4.11 libopencv_video.so - COMMAND docker cp pytorch_backend_ptlib:/usr/lib/${LIBS_ARCH}-linux-gnu/libopencv_imgcodecs.so.3.4.11 libopencv_imgcodecs.so - COMMAND docker cp pytorch_backend_ptlib:/usr/lib/${LIBS_ARCH}-linux-gnu/libopencv_imgproc.so.3.4.11 libopencv_imgproc.so - COMMAND docker cp pytorch_backend_ptlib:/usr/lib/${LIBS_ARCH}-linux-gnu/libopencv_core.so.3.4.11 libopencv_core.so - COMMAND docker cp pytorch_backend_ptlib:/usr/lib/${LIBS_ARCH}-linux-gnu/libpng16.so.16.37.0 libpng16.so - COMMAND /bin/sh -c "if [ -f libmkl_def.so ]; then patchelf --add-needed libmkl_gnu_thread.so libmkl_def.so; fi" - COMMAND /bin/sh -c "if [ -f libmkl_def.so ]; then patchelf --add-needed libmkl_core.so libmkl_def.so; fi" - COMMAND /bin/sh -c "if [ -f libmkl_avx2.so ]; then patchelf --add-needed libmkl_gnu_thread.so libmkl_avx2.so; fi" - COMMAND /bin/sh -c "if [ -f libmkl_avx2.so ]; then patchelf --add-needed libmkl_core.so libmkl_avx2.so; fi" - COMMAND /bin/sh -c "if [ -f libmkl_avx512.so ]; then patchelf --add-needed libmkl_gnu_thread.so libmkl_avx512.so; fi" - COMMAND /bin/sh -c "if [ -f libmkl_avx512.so ]; then patchelf --add-needed libmkl_core.so libmkl_avx512.so; fi" + COMMAND docker cp pytorch_backend_ptlib:${PY_INSTALL_PATH}/torch/include include/torch + COMMAND docker cp pytorch_backend_ptlib:/opt/pytorch/pytorch/torch/csrc/jit/codegen include/torch/torch/csrc/jit/. 
+ + COMMAND /bin/sh -c "if [ ${RHEL_BUILD} = 'ON' ]; then docker cp -L pytorch_backend_ptlib:/usr/lib64/libjpeg.so.62 libjpeg.so.62; else docker cp -L pytorch_backend_ptlib:/usr/local/lib/libjpeg.so.62 libjpeg.so.62 && docker cp pytorch_backend_ptlib:/usr/lib/${LIBS_ARCH}-linux-gnu/libjpeg.so.8.2.2 libjpeg.so; fi;" + COMMAND /bin/sh -c "if [ ${RHEL_BUILD} = 'ON' ]; then docker cp -L pytorch_backend_ptlib:/usr/lib64/libpng16.so.16 libpng16.so.16; else docker cp -L pytorch_backend_ptlib:/usr/lib/${LIBS_ARCH}-linux-gnu/libpng16.so libpng16.so; fi;" + COMMAND /bin/sh -c "if [ -f libmkl_def.so.1 ]; then patchelf --add-needed libmkl_gnu_thread.so.1 libmkl_def.so.1; fi" + COMMAND /bin/sh -c "if [ -f libmkl_def.so.1 ]; then patchelf --add-needed libmkl_core.so.1 libmkl_def.so.1; fi" + COMMAND /bin/sh -c "if [ -f libmkl_avx2.so.1 ]; then patchelf --add-needed libmkl_gnu_thread.so.1 libmkl_avx2.so.1; fi" + COMMAND /bin/sh -c "if [ -f libmkl_avx2.so.1 ]; then patchelf --add-needed libmkl_core.so.1 libmkl_avx2.so.1; fi" + COMMAND /bin/sh -c "if [ -f libmkl_avx512.so.1 ]; then patchelf --add-needed libmkl_gnu_thread.so.1 libmkl_avx512.so.1; fi" + COMMAND /bin/sh -c "if [ -f libmkl_avx512.so.1 ]; then patchelf --add-needed libmkl_core.so.1 libmkl_avx512.so.1; fi" + COMMAND /bin/sh -c "if [ -f libmkl_vml_def.so.1 ]; then patchelf --add-needed libmkl_gnu_thread.so.1 libmkl_vml_def.so.1; fi" + COMMAND /bin/sh -c "if [ -f libmkl_vml_def.so.1 ]; then patchelf --add-needed libmkl_intel_thread.so.1 libmkl_vml_def.so.1; fi" + COMMAND /bin/sh -c "if [ -f libmkl_vml_def.so.1 ]; then patchelf --add-needed libmkl_core.so.1 libmkl_vml_def.so.1; fi" + COMMAND /bin/sh -c "if [ -f libmkl_intel_thread.so.1 ]; then patchelf --add-needed libmkl_intel_lp64.so.1 libmkl_intel_thread.so.1; fi" + COMMAND /bin/sh -c "if [ ${TRITON_PYTORCH_ENABLE_TORCHVISION} = 'ON' ]; then if [ ${RHEL_BUILD} = 'OFF' ]; then ln -s libtorchvision.so.1 libtorchvision.so; fi; fi;" COMMAND docker rm pytorch_backend_ptlib COMMENT "Extracting pytorch and torchvision libraries and includes from ${TRITON_PYTORCH_DOCKER_IMAGE}" VERBATIM ) - add_custom_target(ptlib_target DEPENDS ${PT_LIBS} ${OPENCV_LIBS}) + add_custom_target(ptlib_target DEPENDS ${PT_LIBS} ${LIBTORCH_LIBS} ${TORCHVISION_LIBS}) add_library(ptlib SHARED IMPORTED GLOBAL) add_dependencies(ptlib ptlib_target) @@ -235,6 +289,9 @@ add_library( src/libtorch.cc src/libtorch_utils.cc src/libtorch_utils.h + src/model_instance_state.cc + src/model_state.cc + src/string_utils.cc ) add_library( @@ -266,7 +323,7 @@ endif() # TRITON_PYTORCH_DOCKER_BUILD # Need to turn off -Werror due to Torchvision vision.h extern initialization # Unfortunately gcc does not provide a specific flag to ignore the specific # warning: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=45977 -target_compile_features(triton-pytorch-backend PRIVATE cxx_std_11) +target_compile_features(triton-pytorch-backend PRIVATE cxx_std_${TRITON_MIN_CXX_STANDARD}) target_compile_options( triton-pytorch-backend PRIVATE $<$,$,$>: @@ -317,8 +374,8 @@ if (${TRITON_PYTORCH_DOCKER_BUILD}) if (${TRITON_PYTORCH_ENABLE_TORCHVISION}) set(TRITON_PYTORCH_LIBS - ${TRITON_PYTORCH_LIBS} - "${CMAKE_CURRENT_BINARY_DIR}/libtorchvision.so") + ${TRITON_PYTORCH_LIBS} + "${CMAKE_CURRENT_BINARY_DIR}/$,libtorchvision.so,libtorchvision.so.1>") endif() # TRITON_PYTORCH_ENABLE_TORCHVISION if (${TRITON_PYTORCH_ENABLE_TORCHTRT}) @@ -384,7 +441,7 @@ install( if (${TRITON_PYTORCH_DOCKER_BUILD}) set(PT_LIB_PATHS "") - FOREACH(plib ${PT_LIBS} ${OPENCV_LIBS}) + FOREACH(plib 
${PT_LIBS} ${LIBTORCH_LIBS} ${TORCHVISION_LIBS}) set(PT_LIB_PATHS ${PT_LIB_PATHS} "${CMAKE_CURRENT_BINARY_DIR}/${plib}") ENDFOREACH(plib) @@ -403,7 +460,7 @@ if (${TRITON_PYTORCH_DOCKER_BUILD}) ) endif() # TRITON_PYTORCH_ENABLE_TORCHTRT - FOREACH(plib ${PT_LIBS} ${OPENCV_LIBS}) + FOREACH(plib ${PT_LIBS} ${LIBTORCH_LIBS} ${TORCHVISION_LIBS}) install( CODE "EXECUTE_PROCESS( @@ -416,23 +473,40 @@ if (${TRITON_PYTORCH_DOCKER_BUILD}) ) ENDFOREACH(plib) - set(OPENCV_VERSION "3.4") install( CODE "EXECUTE_PROCESS( - COMMAND ln -sf libopencv_video.so libopencv_video.so.${OPENCV_VERSION} - COMMAND ln -sf libopencv_videoio.so libopencv_videoio.so.${OPENCV_VERSION} - COMMAND ln -sf libopencv_highgui.so libopencv_highgui.so.${OPENCV_VERSION} - COMMAND ln -sf libopencv_imgcodecs.so libopencv_imgcodecs.so.${OPENCV_VERSION} - COMMAND ln -sf libopencv_imgproc.so libopencv_imgproc.so.${OPENCV_VERSION} - COMMAND ln -sf libopencv_core.so libopencv_core.so.${OPENCV_VERSION} COMMAND ln -sf libpng16.so libpng16.so.16 + COMMAND ln -sf libjpeg.so libjpeg.so.8 RESULT_VARIABLE LINK_STATUS WORKING_DIRECTORY ${CMAKE_INSTALL_PREFIX}/backends/pytorch) if(LINK_STATUS AND NOT LINK_STATUS EQUAL 0) message(FATAL_ERROR \"FAILED: to create links\") endif()" ) +else() + FOREACH(plib ${PT_LIBS}) + set(PT_LIB_PATHS ${PT_LIB_PATHS} "${TRITON_PYTORCH_LIB_PATHS}/${plib}") + ENDFOREACH(plib) + + install( + FILES + ${PT_LIB_PATHS} + DESTINATION ${CMAKE_INSTALL_PREFIX}/backends/pytorch + ) + + FOREACH(plib ${PT_LIBS}) + install( + CODE + "EXECUTE_PROCESS( + COMMAND patchelf --set-rpath \$ORIGIN ${plib} + RESULT_VARIABLE PATCHELF_STATUS + WORKING_DIRECTORY ${CMAKE_INSTALL_PREFIX}/backends/pytorch) + if(PATCHELF_STATUS AND NOT PATCHELF_STATUS EQUAL 0) + message(FATAL_ERROR \"FAILED: to run patchelf\") + endif()" + ) + ENDFOREACH(plib) endif() # TRITON_PYTORCH_DOCKER_BUILD install( @@ -446,6 +520,13 @@ install( ${INSTALL_CONFIGDIR} ) +install( + FILES + src/model.py + DESTINATION + ${CMAKE_INSTALL_PREFIX}/backends/pytorch +) + include(CMakePackageConfigHelpers) configure_package_config_file( ${CMAKE_CURRENT_LIST_DIR}/cmake/TritonPyTorchBackendConfig.cmake.in diff --git a/README.md b/README.md index 0eb8388..ccc803c 100644 --- a/README.md +++ b/README.md @@ -1,5 +1,5 @@ +# PyTorch (LibTorch) Backend + [![License](https://img.shields.io/badge/License-BSD3-lightgrey.svg)](https://opensource.org/licenses/BSD-3-Clause) -# PyTorch (LibTorch) Backend +The Triton backend for +[PyTorch](https://github.com/pytorch/pytorch) +is designed to run +[TorchScript](https://pytorch.org/docs/stable/jit.html) +models using the PyTorch C++ API. +All models created in PyTorch using the python API must be traced/scripted to produce a TorchScript model. -The Triton backend for [PyTorch](https://github.com/pytorch/pytorch). -You can learn more about Triton backends in the [backend -repo](https://github.com/triton-inference-server/backend). Ask -questions or report problems on the [issues -page](https://github.com/triton-inference-server/server/issues). -This backend is designed to run [TorchScript](https://pytorch.org/docs/stable/jit.html) -models using the PyTorch C++ API. All models created in PyTorch -using the python API must be traced/scripted to produce a TorchScript -model. - -Where can I ask general questions about Triton and Triton backends? 
-Be sure to read all the information below as well as the [general -Triton documentation](https://github.com/triton-inference-server/server#triton-inference-server) -available in the main [server](https://github.com/triton-inference-server/server) -repo. If you don't find your answer there you can ask questions on the -main Triton [issues page](https://github.com/triton-inference-server/server/issues). +You can learn more about Triton backends in the +[Triton Backend](https://github.com/triton-inference-server/backend) +repository. + +Ask questions or report problems using +[Triton Server issues](https://github.com/triton-inference-server/server/issues). + +Be sure to read all the information below as well as the +[general Triton documentation](https://github.com/triton-inference-server/server#triton-inference-server) +available in the [Triton Server](https://github.com/triton-inference-server/server) repository. ## Build the PyTorch Backend -Use a recent cmake to build. First install the required dependencies. +Use a recent cmake to build. +First install the required dependencies. -``` -$ apt-get install patchelf rapidjson-dev python3-dev +```bash +apt-get install rapidjson-dev python3-dev python3-pip +pip3 install patchelf==0.17.2 ``` -An appropriate PyTorch container from [NGC](https://ngc.nvidia.com) must be used. -For example, to build a backend that uses the 21.02 version of the PyTorch -container from NGC: +An appropriate PyTorch container from [NVIDIA NGC Catalog](https://ngc.nvidia.com) must be used. +For example, to build a backend that uses the 23.04 version of the PyTorch container from NGC: -``` -$ mkdir build -$ cd build -$ cmake -DCMAKE_INSTALL_PREFIX:PATH=`pwd`/install -DTRITON_PYTORCH_DOCKER_IMAGE="nvcr.io/nvidia/pytorch:21.02-py3" .. -$ make install +```bash +mkdir build +cd build +cmake -DCMAKE_INSTALL_PREFIX:PATH=`pwd`/install -DTRITON_PYTORCH_DOCKER_IMAGE="nvcr.io/nvidia/pytorch:23.04-py3" .. +make install ``` -The following required Triton repositories will be pulled and used in -the build. By default the "main" branch/tag will be used for each repo -but the listed CMake argument can be used to override. +The following required Triton repositories will be pulled and used in the build. +By default, the `main` head will be used for each repository but the listed CMake argument can be used to override the value. -* triton-inference-server/backend: -DTRITON_BACKEND_REPO_TAG=[tag] -* triton-inference-server/core: -DTRITON_CORE_REPO_TAG=[tag] -* triton-inference-server/common: -DTRITON_COMMON_REPO_TAG=[tag] +* triton-inference-server/backend: `-DTRITON_BACKEND_REPO_TAG=[tag]` +* triton-inference-server/core: `-DTRITON_CORE_REPO_TAG=[tag]` +* triton-inference-server/common: `-DTRITON_COMMON_REPO_TAG=[tag]` ## Build the PyTorch Backend With Custom PyTorch -Currently, Triton requires that a specially patched version of -PyTorch be used with the PyTorch backend. The full source for -these PyTorch versions are available as Docker images from -[NGC](https://ngc.nvidia.com). For example, the PyTorch version -compatible with the 21.02 release of Triton is available as -nvcr.io/nvidia/pytorch:21.02-py3. +Currently, Triton requires that a specially patched version of PyTorch be used with the PyTorch backend. +The full source for these PyTorch versions are available as Docker images from +[NGC](https://ngc.nvidia.com). 
+ + For example, the PyTorch version compatible with the 25.09 release of Triton is available as `nvcr.io/nvidia/pytorch:25.09-py3`, which supports PyTorch version `2.9.0a0`. + +> [!NOTE] +> Additional details and version information can be found in the container's +> [release notes](https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/rel-25-09.html#rel-25-09). -Copy over the LibTorch and Torchvision headers and libraries from the +Copy over the LibTorch and TorchVision headers and libraries from the [PyTorch NGC container](https://ngc.nvidia.com/catalog/containers/nvidia:pytorch) -into local directories. You can see which headers and libraries -are needed/copied from the docker. +into local directories. +You can see which headers and libraries are needed/copied from the docker. + +```bash +mkdir build +cd build +cmake -DCMAKE_INSTALL_PREFIX:PATH=`pwd`/install -DTRITON_PYTORCH_INCLUDE_PATHS="/torch;/torch/torch/csrc/api/include;/torchvision" -DTRITON_PYTORCH_LIB_PATHS="" .. +make install +``` + +## Using the PyTorch Backend +### PyTorch 2.0 Models + +PyTorch 2.0 features are available. +However, Triton's PyTorch backend requires a serialized representation of the model in the form of a `model.pt` file. +The `model.pt` file can be generated using PyTorch's +[`torch.save()`](https://docs.pytorch.org/tutorials/beginner/saving_loading_models.html#id1) +function. + +The model repository should look like: + +```bash +model_repository/ +`-- model_directory + |-- 1 + | `-- model.pt + `-- config.pbtxt ``` -$ mkdir build -$ cd build -$ cmake -DCMAKE_INSTALL_PREFIX:PATH=`pwd`/install -DTRITON_PYTORCH_INCLUDE_PATHS="/torch;/torch/torch/csrc/api/include;/torchvision" -DTRITON_PYTORCH_LIB_PATHS="" .. -$ make install + +Where `model.pt` is the serialized representation of the model. + +### TorchScript Models + +The model repository should look like: + +```bash +model_repository/ +`-- model_directory + |-- 1 + | `-- model.pt + `-- config.pbtxt ``` -## Using the PyTorch Backend +The `model.pt` is the TorchScript model file. + +## Configuration + +Triton exposes some flags to control the execution mode of the TorchScript models through the `Parameters` section of the model's `config.pbtxt` file. + +### Configuration Options + +* `default_model_name`: + Instructs the Triton PyTorch backend to load the model from a file of the given name. + + The model config specifying the option would look like: + + ```proto + default_model_name: "another_file_name.pt" + ``` ### Parameters -Triton exposes some flags to control the execution mode of the TorchScript models through -the Parameters section of the model's 'config.pbtxt' file. +* `DISABLE_OPTIMIZED_EXECUTION`: + Boolean flag to disable the optimized execution of TorchScript models. + By default, the optimized execution is always enabled. -* `DISABLE_OPTIMIZED_EXECUTION`: Boolean flag to disable the optimized execution -of TorchScript models. By default the optimized execuiton is always enabled. + The initial calls to a loaded TorchScript model take a significant amount of time. + Due to this longer model warmup + ([pytorch #57894](https://github.com/pytorch/pytorch/issues/57894)), + Triton also allows execution of models without these optimizations.
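+
+(Referring back to the *PyTorch 2.0 Models* section above: a minimal, hedged sketch of producing the `model.pt` file with `torch.save()`. `MyModel` and the destination path are illustrative placeholders that mirror the repository layout shown earlier; they are not part of this backend.)
+
+```python
+import torch
+
+
+class MyModel(torch.nn.Module):
+    """Placeholder model used only for this illustration."""
+
+    def __init__(self):
+        super().__init__()
+        self.linear = torch.nn.Linear(4, 2)
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        return self.linear(x)
+
+
+model = MyModel().eval()
+
+# torch.save() on the full module produces a single-file serialized model;
+# the destination simply mirrors the model repository layout shown above.
+torch.save(model, "model_repository/model_directory/1/model.pt")
+```
+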
+ In some models, optimized execution does not benefit performance + ([pytorch #19978](https://github.com/pytorch/pytorch/issues/19978)) + and in other cases impacts performance negatively + ([pytorch #53824](https://github.com/pytorch/pytorch/issues/53824)). -The initial calls to a loaded TorchScript model take extremely long. Due to this longer -model warmup [issue](https://github.com/pytorch/pytorch/issues/57894), Triton also allows -execution of models without these optimizations. In some models, optimized execution -does not benefit performance as seen [here](https://github.com/pytorch/pytorch/issues/19978) -and in other cases impacts performance negatively, as seen [here](https://github.com/pytorch/pytorch/issues/53824). + The section of model config file specifying this parameter will look like: -The section of model config file specifying this parameter will look like: + ```proto + parameters: { + key: "DISABLE_OPTIMIZED_EXECUTION" + value: { string_value: "true" } + } + ``` -``` -parameters: { -key: "DISABLE_OPTIMIZED_EXECUTION" - value: { - string_value:"true" - } -} -``` +* `INFERENCE_MODE`: -* `INFERENCE_MODE`: Boolean flag to enable the Inference Mode execution -of TorchScript models. By default the inference mode is disabled. + Boolean flag to enable the Inference Mode execution of TorchScript models. + By default, the inference mode is enabled. -[InferenceMode](https://pytorch.org/cppdocs/notes/inference_mode.html) is a new -RAII guard analogous to NoGradMode to be used when you are certain your operations -will have no interactions with autograd. Compared to NoGradMode, code run under -this mode gets better performance by disabling autograd. + [InferenceMode](https://pytorch.org/cppdocs/notes/inference_mode.html) is a new RAII guard analogous to `NoGradMode` to be used when you are certain your operations will have no interactions with autograd. + Compared to `NoGradMode`, code run under this mode gets better performance by disabling autograd. -Please note that in some models, InferenceMode might not benefit performance -and in fewer cases might impact performance negatively. + Please note that in some models, InferenceMode might not benefit performance and in fewer cases might impact performance negatively. -The section of model config file specifying this parameter will look like: + To enable inference mode, use the configuration example below: -``` -parameters: { -key: "INFERENCE_MODE" - value: { - string_value:"true" - } -} -``` + ```proto + parameters: { + key: "INFERENCE_MODE" + value: { string_value: "true" } + } + ``` -* `ENABLE_NVFUSER`: Boolean flag to enable the NvFuser (CUDA Graph -Fuser) optimization for TorchScript models. If not specified, the -default pytorch fuser is used. If `ENABLE_NVFUSER` is specified, the -`ENABLE_TENSOR_FUSER` configuration (see below) is ignored. +* `DISABLE_CUDNN`: -Please note that in some models generated using trace in old PyTorch versions might not work -correctly with NvFuser. We recommend using scripting and a recent version of PyTorch -to generate these models. + Boolean flag to disable the cuDNN library. + By default, cuDNN is enabled. -The section of model config file specifying this parameter will look like: + [cuDNN](https://developer.nvidia.com/cudnn) is a GPU-accelerated library of primitives for deep neural networks. + It provides highly tuned implementations for standard routines. 
-``` -parameters: { -key: "ENABLE_NVFUSER" - value: { - string_value:"true" - } -} -``` + Typically, models run with cuDNN enabled execute faster. + However there are some exceptions where using cuDNN can be slower, cause higher memory usage, or result in errors. -* `ENABLE_WEIGHT_SHARING`: Boolean flag to enable model instances on the same device to -share weights. This optimization should not be used with stateful models. If not specified, -weight sharing is disabled. + To disable cuDNN, use the configuration example below: -The section of model config file specifying this parameter will look like: + ```proto + parameters: { + key: "DISABLE_CUDNN" + value: { string_value: "true" } + } + ``` -``` -parameters: { -key: "ENABLE_WEIGHT_SHARING" - value: { - string_value:"true" - } -} -``` +* `ENABLE_WEIGHT_SHARING`: + + Boolean flag to enable model instances on the same device to share weights. + This optimization should not be used with stateful models. + If not specified, weight sharing is disabled. + + To enable weight sharing, use the configuration example below: + + ```proto + parameters: { + key: "ENABLE_WEIGHT_SHARING" + value: { string_value: "true" } + } + ``` + +* `ENABLE_CACHE_CLEANING`: + + Boolean flag to enable CUDA cache cleaning after each model execution. + If not specified, cache cleaning is disabled. + This flag has no effect if model is on CPU. -* Additional Optimizations: Three additional boolean parameters are available to disable -certain Torch optimizations that can sometimes cause latency regressions in models with -complex execution modes and dynamic shapes. If not specified, all are enabled by default. + Setting this flag to true will likely negatively impact the performance due to additional CUDA cache cleaning operation after each model execution. + Therefore, you should only use this flag if you serve multiple models with Triton and encounter CUDA out-of-memory issues during model executions. + + To enable cleaning of the CUDA cache after every execution, use the configuration example below: + + ```proto + parameters: { + key: "ENABLE_CACHE_CLEANING" + value: { string_value: "true" } + } + ``` + +* `INTER_OP_THREAD_COUNT`: + + PyTorch allows using multiple CPU threads during TorchScript model inference. + One or more inference threads execute a model’s forward pass on the given inputs. + Each inference thread invokes a JIT interpreter that executes the ops of a model inline, one by one. + + This parameter sets the size of this thread pool. + The default value of this setting is the number of cpu cores. + + > [!TIP] + > Refer to + > [CPU Threading TorchScript](https://pytorch.org/docs/stable/notes/cpu_threading_torchscript_inference.html) + > on how to set this parameter properly. + + To set the inter-op thread count, use the configuration example below: + + ```proto + parameters: { + key: "INTER_OP_THREAD_COUNT" + value: { string_value: "1" } + } + ``` + +> [!NOTE] +> This parameter is set globally for the PyTorch backend. +> The value from the first model config file that specifies this parameter will be used. +> Subsequent values from other model config files, if different, will be ignored. + +* `INTRA_OP_THREAD_COUNT`: + + In addition to the inter-op parallelism, PyTorch can also utilize multiple threads within the ops (intra-op parallelism). + This can be useful in many cases, including element-wise ops on large tensors, convolutions, GEMMs, embedding lookups and others. + + The default value for this setting is the number of CPU cores. 
+ + > [!TIP] + > Refer to + > [CPU Threading TorchScript](https://pytorch.org/docs/stable/notes/cpu_threading_torchscript_inference.html) + > on how to set this parameter properly. + + To set the intra-op thread count, use the configuration example below: + + ```proto + parameters: { + key: "INTRA_OP_THREAD_COUNT" + value: { string_value: "1" } + } + ``` + +* **Additional Optimizations**: + + Three additional boolean parameters are available to disable certain Torch optimizations that can sometimes cause latency regressions in models with complex execution modes and dynamic shapes. + If not specified, all are enabled by default. `ENABLE_JIT_EXECUTOR` `ENABLE_JIT_PROFILING` - `ENABLE_TENSOR_FUSER` - -### Important Note - -* The execution of pytorch model on GPU is asynchronous in nature. See - [here](https://pytorch.org/docs/stable/notes/cuda.html#asynchronous-execution) - for more details. Consequently, an error in pytorch model execution may - be raised during the next few inference requests to the server. Setting - environment variable `CUDA_LAUNCH_BLOCKING=1` when launching server will - help in correctly debugging failing cases by forcing synchronous execution. - * The PyTorch model in such cases may or may not recover from the failed - state and a restart of the server may be required to continue serving - successfully. - -* Multiple instances of the pytorch model on GPU do not always - increase performance. Due to thread specific caching in pytorch, using - multiple instances of the model interact negatively. See - [here](https://github.com/pytorch/pytorch/issues/27902) for more details. - Setting the parameter `DISABLE_OPTIMIZED_EXECUTION` to "true" in the model - configuration may help in some cases to avoid these negative interactions - due to model specific caching and increase multiple instance performance. +### Model Instance Group Kind + +The PyTorch backend supports the following kinds of +[Model Instance Groups](https://github.com/triton-inference-server/server/blob/main/docs/user_guide/model_configuration.md#instance-groups) +where the input tensors are placed as follows: + +* `KIND_GPU`: + + Inputs are prepared on the GPU device associated with the model instance. + +* `KIND_CPU`: + + Inputs are prepared on the CPU. + +* `KIND_MODEL`: + + Inputs are prepared on the CPU. + When loading the model, the backend does not choose the GPU device for the model; + instead, it respects the device(s) specified in the model and uses them as they are during inference. + + This is useful when the model internally utilizes multiple GPUs, as demonstrated in + [this example model](https://github.com/triton-inference-server/server/blob/main/qa/L0_libtorch_instance_group_kind_model/gen_models.py). + + > [!IMPORTANT] + > If a device is not specified in the model, the backend uses the first available GPU device. + +To set the model instance group, use the configuration example below: + +```proto +instance_group { + count: 2 + kind: KIND_GPU +} +``` + +### Customization + +The following PyTorch settings may be customized by setting parameters on the +`config.pbtxt`. + +[`torch.set_num_threads(int)`](https://pytorch.org/docs/stable/generated/torch.set_num_threads.html#torch.set_num_threads) + +* Key: `NUM_THREADS` +* Value: The number of threads used for intra-op parallelism on CPU. 
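+
+(As an aside, a small standalone sketch of the two PyTorch calls referenced in this section: `NUM_THREADS` above and `NUM_INTEROP_THREADS`, described next. This is an illustrative Python snippet, not code taken from the backend.)
+
+```python
+import torch
+
+# Inter-op pool: threads that run independent ops in parallel (e.g. in the
+# JIT interpreter). Must be set before any inter-op parallel work starts,
+# otherwise PyTorch raises a RuntimeError.
+torch.set_num_interop_threads(2)
+
+# Intra-op pool: threads used inside individual ops (GEMMs, convolutions, ...).
+torch.set_num_threads(4)
+
+print(torch.get_num_threads(), torch.get_num_interop_threads())
+```
+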
+ +[`torch.set_num_interop_threads(int)`](https://pytorch.org/docs/stable/generated/torch.set_num_interop_threads.html#torch.set_num_interop_threads) + +* Key: `NUM_INTEROP_THREADS` +* Value: The number of threads used for inter-op parallelism (e.g. in the JIT interpreter) on CPU. + +[`torch.compile()` parameters](https://pytorch.org/docs/stable/generated/torch.compile.html#torch-compile) + +* Key: `TORCH_COMPILE_OPTIONAL_PARAMETERS` +* Value: Any of the following parameter(s) encoded as a JSON object. + * `fullgraph` (`bool`): Whether it is OK to break the model into several subgraphs. + * `dynamic` (`bool`): Use dynamic shape tracing. + * `backend` (`str`): The backend to be used. + * `mode` (`str`): Can be either `"default"`, `"reduce-overhead"`, or `"max-autotune"`. + * `options` (`dict`): A dictionary of options to pass to the backend. + * `disable` (`bool`): Turn `torch.compile()` into a no-op for testing. + +For example: + +```proto +parameters: { + key: "NUM_THREADS" + value: { string_value: "4" } +} +parameters: { + key: "TORCH_COMPILE_OPTIONAL_PARAMETERS" + value: { string_value: "{\"disable\": true}" } +} +``` + +## Important Notes + +* The execution of a PyTorch model on a GPU is asynchronous in nature. + See + [CUDA Asynchronous Execution](https://pytorch.org/docs/stable/notes/cuda.html#asynchronous-execution) + for additional details. + Consequently, an error in PyTorch model execution may be raised during the next few inference requests to the server. + Setting the environment variable `CUDA_LAUNCH_BLOCKING=1` when launching the server will help in correctly debugging failing cases by forcing synchronous execution. + + * The PyTorch model in such cases may or may not recover from the failed state and a restart of the server may be required to continue serving successfully. + +* PyTorch does not support Tensor of Strings but it does support models that accept a List of Strings as input(s) / produce a List of Strings as output(s). + For these models, Triton allows users to pass String input(s)/receive String output(s) using the String datatype. + As a limitation of using List instead of Tensor for String I/O, only 1-dimensional input(s)/output(s) are supported for I/O of String type. + +* In a multi-GPU environment, a potential runtime issue can occur when using + [Tracing](https://pytorch.org/docs/stable/generated/torch.jit.trace.html) + to generate a + [TorchScript](https://pytorch.org/docs/stable/jit.html) + model. + This issue arises due to a device mismatch between the model instance and the tensor. + + By default, Triton creates a single execution instance of the model for each available GPU. + The runtime error occurs when a request is sent to a model instance with a different GPU device from the one used during the TorchScript generation process. + + To address this problem, it is highly recommended to use + [Scripting](https://pytorch.org/docs/stable/generated/torch.jit.script.html#torch.jit.script) + instead of Tracing for model generation in a multi-GPU environment. + Scripting avoids the device mismatch issue and ensures compatibility with different GPUs when used with Triton (a brief sketch follows below). + + However, if using Tracing is unavoidable, there is a workaround available.
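+
+Before describing that workaround, here is a minimal sketch of the scripting route recommended above, assuming a trivial `torch.nn.Module` (`ToyModel` and the output path are placeholders, not part of this repository):
+
+```python
+import torch
+
+
+class ToyModel(torch.nn.Module):
+    """Placeholder module with data-dependent control flow."""
+
+    def __init__(self):
+        super().__init__()
+        self.linear = torch.nn.Linear(4, 2)
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        # Branches like this are preserved by scripting but frozen by tracing.
+        if x.sum() > 0:
+            return self.linear(x)
+        return self.linear(-x)
+
+
+model = ToyModel().eval()
+
+# Scripting compiles the Python source directly, so no example input (and no
+# device placement) is baked into the exported TorchScript artifact.
+torch.jit.script(model).save("model_repository/model_directory/1/model.pt")
+
+# Tracing, by contrast, records the ops executed for one concrete input on one
+# concrete device, which is what can lead to the multi-GPU mismatch above:
+# traced = torch.jit.trace(model, torch.randn(1, 4, device="cuda:0"))
+```
+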
+ You can explicitly specify the GPU device for the model instance in the + [model configuration](https://github.com/triton-inference-server/server/blob/main/docs/user_guide/model_configuration.md#instance-groups) + to ensure that the model instance and the tensors used for inference are assigned to the same GPU device as on which the model was traced. + +* When using `KIND_MODEL` as model instance kind, the default device of the first parameter on the model is used. + +> [!WARNING] +> +> * Python functions optimizable by `torch.compile` may not be served directly in the `model.py` file, they need to be enclosed by a class extending the + [`torch.nn.Module`](https://pytorch.org/docs/stable/generated/torch.nn.Module.html#torch.nn.Module). +> +> * Model weights cannot be shared across multiple instances on the same GPU device. diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..1a8da1f --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,49 @@ +# Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +[tool.codespell] +# note: pre-commit passes explicit lists of files here, which this skip file list doesn't override - +# this is only to allow you to run codespell interactively +skip = "./.git,./.github" +# ignore short words, and typename parameters like OffsetT +ignore-regex = "\\b(.{1,4}|[A-Z]\\w*T)\\b" +# use the 'clear' dictionary for unambiguous spelling mistakes +builtin = "clear" +# disable warnings about binary files and wrong encoding +quiet-level = 3 + +[tool.isort] +profile = "black" +use_parentheses = true +multi_line_output = 3 +include_trailing_comma = true +force_grid_wrap = 0 +ensure_newline_before_comments = true +line_length = 88 +balanced_wrapping = true +indent = " " +skip = ["build"] + diff --git a/src/libtorch.cc b/src/libtorch.cc index 6934a6c..500f1f5 100644 --- a/src/libtorch.cc +++ b/src/libtorch.cc @@ -1,4 +1,4 @@ -// Copyright 2019-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// Copyright 2019-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
// // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions @@ -24,1350 +24,13 @@ // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -#include -#include -#include "libtorch_utils.h" -#include "triton/backend/backend_common.h" -#include "triton/backend/backend_input_collector.h" -#include "triton/backend/backend_memory.h" -#include "triton/backend/backend_model.h" -#include "triton/backend/backend_model_instance.h" -#include "triton/backend/backend_output_responder.h" -#include "triton/core/tritonbackend.h" - -#ifdef TRITON_PYTORCH_ENABLE_TORCHVISION -// Suppress warnings in torch headers -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wsign-compare" -#pragma warning(push, 0) -#include -#include // Torchvision header -#pragma warning(pop) -#pragma GCC diagnostic pop -#endif // TRITON_PYTORCH_ENABLE_TORCHVISION - -#ifdef TRITON_ENABLE_GPU -#include -#include -#endif // TRITON_ENABLE_GPU +#include "libtorch.hh" // // PyTorch C++ (LibTorch) Backend that implements the TRITONBACKEND API. // -namespace triton { namespace backend { namespace pytorch { - -// -// ModelState -// -// State associated with a model that is using this backend. An object -// of this class is created and associated with each -// TRITONBACKEND_Model. -// -class ModelState : public BackendModel { - public: - static TRITONSERVER_Error* Create( - TRITONBACKEND_Model* triton_model, ModelState** state); - virtual ~ModelState() = default; - - // Load a TorchScript model using 'artifact_name' as the name for the - // TorchScript file. Return in 'model_path' the full path to the - // TorchScript file, return in 'torch_model' the Torch Module - // representing the model. - TRITONSERVER_Error* LoadModel( - const std::string& artifact_name, const torch::Device device, - std::string* model_path, - std::shared_ptr* torch_model); - - bool EnabledOptimizedExecution() { return enable_optimized_execution_; } - const std::pair& EnabledTensorExprFuser() const - { - return enable_tensor_fuser_pair_; - } - const std::pair& EnabledJitProfiling() const - { - return enable_jit_profiling_pair_; - } - const std::pair& EnabledJitExecutor() const - { - return enable_jit_executor_pair_; - } - bool EnabledInferenceMode() { return enable_inference_mode_; } - const std::pair& EnabledNvfuserPair() const - { - return enable_nvfuser_pair_; - } - - bool EnabledWeightSharing() { return enable_weight_sharing_; } - - private: - ModelState(TRITONBACKEND_Model* triton_model); - TRITONSERVER_Error* AutoCompleteConfig(); - - // Parses and validates parameters in config - TRITONSERVER_Error* ParseParameters(); - - // Flag to indicate whether optimized execution is enabled. Defaults to true. - bool enable_optimized_execution_; - - // Flag to indicate whether inference mode is enabled. Defaults to false. - bool enable_inference_mode_; - - // Flag to indicate whether weight sharing is enabled. Defaults to false. - bool enable_weight_sharing_; - - // Flag pairs to indicate if various JIT settings are set and - // enabled respectively. Defaults to (false, true). Default behavior - // is to do nothing if not explicitly set. Tensor fuser flag is - // ignore if nvfuser is explicitly set. - std::pair enable_tensor_fuser_pair_; - std::pair enable_jit_profiling_pair_; - std::pair enable_jit_executor_pair_; - - // Flag pair to indicate whether nvfuser is set and enabled respectively. 
- // Defaults to (false, false). - std::pair enable_nvfuser_pair_; - - // Model mapping for shared TorchScript model across all instances on the - // same device. The key is a pair of isGPU and device index. - std::map< - std::pair, std::shared_ptr> - torch_models_; -}; - -TRITONSERVER_Error* -ModelState::Create(TRITONBACKEND_Model* triton_model, ModelState** state) -{ - try { - *state = new ModelState(triton_model); - } - catch (const BackendModelException& ex) { - RETURN_ERROR_IF_TRUE( - ex.err_ == nullptr, TRITONSERVER_ERROR_INTERNAL, - std::string("unexpected nullptr in BackendModelException")); - RETURN_IF_ERROR(ex.err_); - } - - // Auto-complete the configuration if requested... - bool auto_complete_config = false; - RETURN_IF_ERROR(TRITONBACKEND_ModelAutoCompleteConfig( - triton_model, &auto_complete_config)); - if (auto_complete_config) { - RETURN_IF_ERROR((*state)->AutoCompleteConfig()); - - triton::common::TritonJson::WriteBuffer json_buffer; - (*state)->ModelConfig().Write(&json_buffer); - - TRITONSERVER_Message* message; - RETURN_IF_ERROR(TRITONSERVER_MessageNewFromSerializedJson( - &message, json_buffer.Base(), json_buffer.Size())); - RETURN_IF_ERROR(TRITONBACKEND_ModelSetConfig( - triton_model, 1 /* config_version */, message)); - } - - RETURN_IF_ERROR((*state)->ParseParameters()); - - return nullptr; // success -} - -ModelState::ModelState(TRITONBACKEND_Model* triton_model) - : BackendModel(triton_model), enable_optimized_execution_(true), - enable_inference_mode_(false), enable_weight_sharing_(false), - enable_tensor_fuser_pair_({false, true}), - enable_jit_profiling_pair_({false, true}), - enable_jit_executor_pair_({false, true}), - enable_nvfuser_pair_({false, false}) -{ -} - -TRITONSERVER_Error* -ModelState::LoadModel( - const std::string& artifact_name, const torch::Device device, - std::string* model_path, - std::shared_ptr* torch_model) -{ - // Find the TorchScript file that describes the model. If the model - // configuration doesn't have an explicit model file specified then - // use the default name ("model.pt"). 
- std::string cc_model_filename = artifact_name; - if (cc_model_filename.empty()) { - cc_model_filename = "model.pt"; - } - - *model_path = JoinPath( - {RepositoryPath(), std::to_string(Version()), cc_model_filename}); - - { - bool exists; - RETURN_IF_ERROR(FileExists(*model_path, &exists)); - RETURN_ERROR_IF_FALSE( - exists, TRITONSERVER_ERROR_UNAVAILABLE, - std::string("unable to find '") + *model_path + - "' for model instance '" + Name() + "'"); - } - - // If weight sharing is enabled, skip loading model if - // it is already available on the target device - std::pair device_pair; - if (enable_weight_sharing_) { - device_pair = std::make_pair(!device.is_cpu(), device.index()); - auto mit = torch_models_.find(device_pair); - if (mit != torch_models_.end()) { - *torch_model = mit->second; - LOG_MESSAGE( - TRITONSERVER_LOG_INFO, - (std::string("Reusing TorchScript model for instance '") + Name() + - "'") - .c_str()); - return nullptr; // success - } - } - - // Serialize the torch model to string - std::string model_data_str; - RETURN_IF_ERROR(ReadTextFile(*model_path, &model_data_str)); - - // InferenceMode should be used to guard all tensors operations including - // model loading: https://pytorch.org/cppdocs/notes/inference_mode.html - torch::InferenceMode infer_guard(EnabledInferenceMode()); - - try { - std::istringstream model_stream(model_data_str); - torch_model->reset( - new torch::jit::Module(torch::jit::load(model_stream, device))); - } - catch (const std::exception& ex) { - return TRITONSERVER_ErrorNew( - TRITONSERVER_ERROR_INTERNAL, - ("failed to load model '" + Name() + "': " + ex.what()).c_str()); - } - - if (enable_weight_sharing_) { - if (!((torch_models_.emplace(device_pair, *torch_model)).second)) { - std::string type = device.is_cpu() ? "CPU" : "GPU"; - LOG_MESSAGE( - TRITONSERVER_LOG_WARN, - (std::string("Model already found on target ") + type + " device " + - "(id " + std::to_string(device.index()) + ") for '" + Name() + "'") - .c_str()); - } - } - - return nullptr; // success -} - -TRITONSERVER_Error* -ModelState::AutoCompleteConfig() -{ - // Auto-complete configuration is not supported since PyTorch does not - // store/capture sufficient model metadata so just log error instead. - LOG_MESSAGE( - TRITONSERVER_LOG_WARN, - (std::string("skipping model configuration auto-complete for '") + - Name() + "': not supported for pytorch backend") - .c_str()); - - return nullptr; // success -} - -TRITONSERVER_Error* -ModelState::ParseParameters() -{ - triton::common::TritonJson::Value params; - bool status = model_config_.Find("parameters", ¶ms); - if (status) { - // If 'DISABLE_OPTIMIZED_EXECUTION' is not present in 'parameters' then no - // update is made to 'enable_optimized_execution_'. - bool disable_optimized_execution = false; - TRITONSERVER_Error* err = ParseParameter( - params, "DISABLE_OPTIMIZED_EXECUTION", &disable_optimized_execution); - if (err != nullptr) { - if (TRITONSERVER_ErrorCode(err) != TRITONSERVER_ERROR_NOT_FOUND) { - return err; - } else { - TRITONSERVER_ErrorDelete(err); - } - } - enable_optimized_execution_ = !disable_optimized_execution; - - LOG_MESSAGE( - TRITONSERVER_LOG_INFO, - (std::string("Optimized execution is ") + - (enable_optimized_execution_ ? "enabled" : "disabled") + - " for model instance '" + Name() + "'") - .c_str()); - - // If 'INFERENCE_MODE' is not present in 'parameters' then no update is made - // to 'enable_inference_mode_'. 
- err = ParseParameter(params, "INFERENCE_MODE", &enable_inference_mode_); - if (err != nullptr) { - if (TRITONSERVER_ErrorCode(err) != TRITONSERVER_ERROR_NOT_FOUND) { - return err; - } else { - TRITONSERVER_ErrorDelete(err); - } - } - - LOG_MESSAGE( - TRITONSERVER_LOG_INFO, - (std::string("Inference Mode is ") + - (enable_inference_mode_ ? "enabled" : "disabled") + - " for model instance '" + Name() + "'") - .c_str()); - - // If 'ENABLE_TENSOR_FUSER' is not present in 'parameters' then no - // update is made to 'enable_tensor_fuser'. - bool enable_tensor_fuser = false; - err = ParseParameter(params, "ENABLE_TENSOR_FUSER", &enable_tensor_fuser); - if (err != nullptr) { - if (TRITONSERVER_ErrorCode(err) != TRITONSERVER_ERROR_NOT_FOUND) { - return err; - } else { - TRITONSERVER_ErrorDelete(err); - } - } else { - enable_tensor_fuser_pair_ = {true, enable_tensor_fuser}; - LOG_MESSAGE( - TRITONSERVER_LOG_INFO, - (std::string("Tensor fuser is ") + - (enable_tensor_fuser ? "enabled" : "disabled") + - " for model instance '" + Name() + "'") - .c_str()); - } - - // If 'ENABLE_WEIGHT_SHARING' is not present in 'parameters' then no - // update is made to 'enable_weight_sharing'. - err = ParseParameter( - params, "ENABLE_WEIGHT_SHARING", &enable_weight_sharing_); - if (err != nullptr) { - if (TRITONSERVER_ErrorCode(err) != TRITONSERVER_ERROR_NOT_FOUND) { - return err; - } else { - TRITONSERVER_ErrorDelete(err); - } - } else { - LOG_MESSAGE( - TRITONSERVER_LOG_INFO, - (std::string("Weight sharing is ") + - (enable_weight_sharing_ ? "enabled" : "disabled") + - " for model instance '" + Name() + "'") - .c_str()); - } - - // If 'ENABLE_JIT_PROFILING' is not present in 'parameters' then no update - // is made to 'enable_jit_profiling'. - bool enable_jit_profiling = false; - err = ParseParameter(params, "ENABLE_JIT_PROFILING", &enable_jit_profiling); - if (err != nullptr) { - if (TRITONSERVER_ErrorCode(err) != TRITONSERVER_ERROR_NOT_FOUND) { - return err; - } else { - TRITONSERVER_ErrorDelete(err); - } - } else { - enable_jit_profiling_pair_ = {true, enable_jit_profiling}; - LOG_MESSAGE( - TRITONSERVER_LOG_INFO, - (std::string("Jit profiling is ") + - (enable_jit_profiling ? "enabled" : "disabled") + - " for model instance '" + Name() + "'") - .c_str()); - } - - // If 'ENABLE_JIT_EXECUTOR' is not present in 'parameters' then no update is - // made to 'enable_jit_executor'. - bool enable_jit_executor = false; - err = ParseParameter(params, "ENABLE_JIT_EXECUTOR", &enable_jit_executor); - if (err != nullptr) { - if (TRITONSERVER_ErrorCode(err) != TRITONSERVER_ERROR_NOT_FOUND) { - return err; - } else { - TRITONSERVER_ErrorDelete(err); - } - } else { - enable_jit_executor_pair_ = {true, enable_jit_executor}; - LOG_MESSAGE( - TRITONSERVER_LOG_INFO, - (std::string("Jit executor is ") + - (enable_jit_executor ? "enabled" : "disabled") + - " for model instance '" + Name() + "'") - .c_str()); - } - - // TODO Re-enable NvFuser once fixed - // If 'ENABLE_NVFUSER' is not present in 'parameters' then no - // update is made to 'enable_nvfuser'. 
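// [Illustrative note; not part of the patch] The optional parameters handled
// throughout ParseParameters() are plain string key/values in the model's
// config.pbtxt, for example:
//
//   parameters: {
//     key: "INFERENCE_MODE"
//     value: { string_value: "true" }
//   }
//   parameters: {
//     key: "ENABLE_WEIGHT_SHARING"
//     value: { string_value: "true" }
//   }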
- bool enable_nvfuser = false; - err = ParseParameter(params, "ENABLE_NVFUSER", &enable_nvfuser); - if (err != nullptr) { - if (TRITONSERVER_ErrorCode(err) != TRITONSERVER_ERROR_NOT_FOUND) { - return err; - } else { - LOG_MESSAGE( - TRITONSERVER_LOG_INFO, (std::string("NvFuser is not specified") + - " for model instance '" + Name() + "'") - .c_str()); - TRITONSERVER_ErrorDelete(err); - } - } else { - // Override, disable NvFuser till fixed - enable_nvfuser = false; - enable_nvfuser_pair_ = {true, enable_nvfuser}; - LOG_MESSAGE( - TRITONSERVER_LOG_WARN, (std::string("NvFuser is ") + - (enable_nvfuser ? "enabled" : "disabled") + - " for model instance '" + Name() + "'") - .c_str()); - } - } - - return nullptr; -} - -// -// ModelInstanceState -// -// State associated with a model instance. An object of this class is -// created and associated with each TRITONBACKEND_ModelInstance. -// -class ModelInstanceState : public BackendModelInstance { - public: - static TRITONSERVER_Error* Create( - ModelState* model_state, - TRITONBACKEND_ModelInstance* triton_model_instance, - ModelInstanceState** state); - virtual ~ModelInstanceState(); - - // Get the state of the model that corresponds to this instance. - ModelState* StateForModel() const { return model_state_; } - - // Execute... - void ProcessRequests( - TRITONBACKEND_Request** requests, const uint32_t request_count); - - private: - ModelInstanceState( - ModelState* model_state, - TRITONBACKEND_ModelInstance* triton_model_instance); - TRITONSERVER_Error* ValidateBooleanSequenceControl( - triton::common::TritonJson::Value& sequence_batching, - const std::string& control_kind, bool required, bool* have_control); - TRITONSERVER_Error* ValidateTypedSequenceControl( - triton::common::TritonJson::Value& sequence_batching, - const std::string& control_kind, bool required, bool* have_control); - TRITONSERVER_Error* ValidateInputs(const size_t expected_input_cnt); - TRITONSERVER_Error* ValidateOutputs(); - void Execute( - std::vector* responses, - const uint32_t response_count, - std::vector* input_tensors, - std::vector* output_tensors); - TRITONSERVER_Error* SetInputTensors( - size_t total_batch_size, TRITONBACKEND_Request** requests, - const uint32_t request_count, - std::vector* responses, - BackendInputCollector* collector, std::vector* input_names, - std::vector* input_tensors, - std::vector* input_memories, bool* cuda_copy); - TRITONSERVER_Error* ReadOutputTensors( - size_t total_batch_size, const std::vector& output_names, - const std::vector& output_tensors, - TRITONBACKEND_Request** requests, const uint32_t request_count, - std::vector* responses, - uint64_t* compute_end_ns); - - ModelState* model_state_; - - // The full path to the TorchScript model file. - std::string model_path_; - - std::shared_ptr torch_model_; - torch::Device device_; - - // Map from configuration name for an input to the index of - // that input in the model. - std::unordered_map input_index_map_; - - // Map from configuration name for an output to the index of - // that output in the model. - std::unordered_map output_index_map_; - std::unordered_map output_dtype_map_; - - // If the input to the tensor is a dictionary of tensors. 
- bool is_dict_input_; -}; - -TRITONSERVER_Error* -ModelInstanceState::Create( - ModelState* model_state, TRITONBACKEND_ModelInstance* triton_model_instance, - ModelInstanceState** state) -{ - try { - *state = new ModelInstanceState(model_state, triton_model_instance); - } - catch (const BackendModelInstanceException& ex) { - RETURN_ERROR_IF_TRUE( - ex.err_ == nullptr, TRITONSERVER_ERROR_INTERNAL, - std::string("unexpected nullptr in BackendModelInstanceException")); - RETURN_IF_ERROR(ex.err_); - } - - return nullptr; // success -} - -ModelInstanceState::ModelInstanceState( - ModelState* model_state, TRITONBACKEND_ModelInstance* triton_model_instance) - : BackendModelInstance(model_state, triton_model_instance), - model_state_(model_state), device_(torch::kCPU), is_dict_input_(false) -{ - if (Kind() == TRITONSERVER_INSTANCEGROUPKIND_GPU) { - device_ = torch::Device(torch::kCUDA, DeviceId()); - } - - THROW_IF_BACKEND_INSTANCE_ERROR(model_state->LoadModel( - ArtifactFilename(), device_, &model_path_, &torch_model_)); - - size_t expected_input_cnt = 0; - { - triton::common::TritonJson::Value inputs; - if (model_state->ModelConfig().Find("input", &inputs)) { - expected_input_cnt = inputs.ArraySize(); - } - } - - // If this is a sequence model then make sure that the required - // inputs are present in the model and have the correct shape and - // datatype. - triton::common::TritonJson::Value sequence_batching; - if (model_state->ModelConfig().Find( - "sequence_batching", &sequence_batching)) { - bool have_start, have_end, have_ready, have_corrid; - THROW_IF_BACKEND_INSTANCE_ERROR(ValidateBooleanSequenceControl( - sequence_batching, "CONTROL_SEQUENCE_START", false /* required */, - &have_start)); - THROW_IF_BACKEND_INSTANCE_ERROR(ValidateBooleanSequenceControl( - sequence_batching, "CONTROL_SEQUENCE_END", false /* required */, - &have_end)); - THROW_IF_BACKEND_INSTANCE_ERROR(ValidateBooleanSequenceControl( - sequence_batching, "CONTROL_SEQUENCE_READY", false /* required */, - &have_ready)); - THROW_IF_BACKEND_INSTANCE_ERROR(ValidateTypedSequenceControl( - sequence_batching, "CONTROL_SEQUENCE_CORRID", false /* required */, - &have_corrid)); - if (have_start) { - expected_input_cnt += 1; - } - if (have_end) { - expected_input_cnt += 1; - } - if (have_ready) { - expected_input_cnt += 1; - } - if (have_corrid) { - expected_input_cnt += 1; - } - } - - THROW_IF_BACKEND_INSTANCE_ERROR(ValidateInputs(expected_input_cnt)); - THROW_IF_BACKEND_INSTANCE_ERROR(ValidateOutputs()); -} - -ModelInstanceState::~ModelInstanceState() -{ - torch_model_.reset(); -#ifdef TRITON_ENABLE_GPU - if (device_.is_cuda()) { - c10::cuda::CUDACachingAllocator::emptyCache(); - } -#endif // TRITON_ENABLE_GPU -} - -TRITONSERVER_Error* -ModelInstanceState::ValidateBooleanSequenceControl( - triton::common::TritonJson::Value& sequence_batching, - const std::string& control_kind, bool required, bool* have_control) -{ - std::string tensor_name; - std::string tensor_datatype; - RETURN_IF_ERROR(GetBooleanSequenceControlProperties( - sequence_batching, model_state_->Name(), control_kind, required, - &tensor_name, &tensor_datatype, nullptr, nullptr, nullptr, nullptr, - nullptr, nullptr)); - *have_control = !tensor_name.empty(); - if (*have_control) { - std::string deliminator = "__"; - int ip_index = 0; - try { - int start_pos = tensor_name.find(deliminator); - if (start_pos == -1) { - throw std::invalid_argument("input must follow naming convention"); - } - ip_index = std::atoi(tensor_name.substr(start_pos + 2).c_str()); - 
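// [Illustrative sketch; not part of the patch] The "<name>__<index>" naming
// convention being enforced here, as a standalone helper (hypothetical name):
//
//   #include <cstdlib>
//   #include <optional>
//   #include <string>
//
//   std::optional<int> ParseTensorIndex(const std::string& tensor_name)
//   {
//     const size_t pos = tensor_name.find("__");
//     if (pos == std::string::npos) {
//       return std::nullopt;  // e.g. "START" carries no trailing index
//     }
//     // "INPUT__2" -> 2, "CORRID__3" -> 3
//     return std::atoi(tensor_name.substr(pos + 2).c_str());
//   }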
input_index_map_[tensor_name] = ip_index; - } - catch (std::exception& ex) { - return TRITONSERVER_ErrorNew( - TRITONSERVER_ERROR_INTERNAL, - ("input '" + tensor_name + - "' does not follow naming convention i.e. __.") - .c_str()); - } - } - - return nullptr; // success -} - -TRITONSERVER_Error* -ModelInstanceState::ValidateTypedSequenceControl( - triton::common::TritonJson::Value& sequence_batching, - const std::string& control_kind, bool required, bool* have_control) -{ - std::string tensor_name; - std::string tensor_datatype; - RETURN_IF_ERROR(GetTypedSequenceControlProperties( - sequence_batching, model_state_->Name(), control_kind, required, - &tensor_name, &tensor_datatype)); - *have_control = !tensor_name.empty(); - if (*have_control) { - std::string deliminator = "__"; - int ip_index = 0; - try { - int start_pos = tensor_name.find(deliminator); - if (start_pos == -1) { - throw std::invalid_argument("input must follow naming convention"); - } - ip_index = std::atoi(tensor_name.substr(start_pos + 2).c_str()); - input_index_map_[tensor_name] = ip_index; - } - catch (std::exception& ex) { - return TRITONSERVER_ErrorNew( - TRITONSERVER_ERROR_INTERNAL, - ("input '" + tensor_name + - "' does not follow naming convention i.e. __.") - .c_str()); - } - } - - return nullptr; // success -} - -TRITONSERVER_Error* -ModelInstanceState::ValidateInputs(const size_t expected_input_cnt) -{ - // Collect all the expected input tensor names and validate that the model - // configuration specifies only those. - std::set allowed_inputs; - - const torch::jit::Method& method = torch_model_->get_method("forward"); - const auto& schema = method.function().getSchema(); - const std::vector& arguments = schema.arguments(); - - // Currently, only models with a single input of type Dict(str, Tensor) are - // supported. If the model expects more than one input then they must be all - // be of type Tensor. - // - // Ignore the argument at idx 0 if it is of Class type (self param in forward - // function) - size_t start_idx = 0; - if ((arguments.size() > 0) && - (arguments.at(0).type()->kind() == c10::TypeKind::ClassType)) { - start_idx = 1; - } - if ((arguments.size() == (1 + start_idx)) && - (arguments.at(start_idx).type()->kind() == c10::TypeKind::DictType)) { - is_dict_input_ = true; - } else if (arguments.size() > start_idx) { - // Return error if multiple inputs are of kind DictType - for (size_t i = start_idx + 1; i < arguments.size(); i++) { - if (arguments.at(i).type()->kind() == c10::TypeKind::DictType) { - return TRITONSERVER_ErrorNew( - TRITONSERVER_ERROR_INTERNAL, - "Multiple inputs of kind DictType were detected. Only a single " - "input of type Dict(str, Tensor) is supported."); - } - } - - // Return error if all inputs are not of type Tensor - for (size_t i = start_idx; i < arguments.size(); i++) { - if (arguments.at(i).type()->kind() != c10::TypeKind::TensorType) { - return TRITONSERVER_ErrorNew( - TRITONSERVER_ERROR_INTERNAL, - (std::string("An input of type '") + arguments.at(i).type()->str() + - "' was detected in the model. 
Only a single input of type " - "Dict(str, Tensor) or input(s) of type Tensor are supported.") - .c_str()); - } - allowed_inputs.emplace(arguments.at(i).name()); - } - - // If all inputs are tensors, match number of expected inputs between model - // and configuration - if ((arguments.size() - start_idx) != expected_input_cnt) { - return TRITONSERVER_ErrorNew( - TRITONSERVER_ERROR_INVALID_ARG, - (std::string("unable to load model '") + model_state_->Name() + - "', configuration expects " + std::to_string(expected_input_cnt) + - " inputs, model provides " + - std::to_string(arguments.size() - start_idx)) - .c_str()); - } - } - - triton::common::TritonJson::Value ios; - RETURN_IF_ERROR(model_state_->ModelConfig().MemberAsArray("input", &ios)); - std::string deliminator = "__"; - int ip_index = 0; - - if (ios.ArraySize() == 0) { - return TRITONSERVER_ErrorNew( - TRITONSERVER_ERROR_INTERNAL, - "model configuration must contain at least one input, none were " - "specified."); - } - - for (size_t i = 0; i < ios.ArraySize(); i++) { - triton::common::TritonJson::Value io; - RETURN_IF_ERROR(ios.IndexAsObject(i, &io)); - - // Validate name - std::string io_name; - RETURN_IF_ERROR(io.MemberAsString("name", &io_name)); - if (is_dict_input_) { - // If dictionary, index is irrelevant but we use the map to store the - // input names since they are the keys for the dictionary - input_index_map_[io_name] = i; - } else { - // input tensor name must be in 'allowed_inputs' or must follow the naming - // convention - auto itr = allowed_inputs.find(io_name); - if (itr != allowed_inputs.end()) { - input_index_map_[io_name] = std::distance(allowed_inputs.begin(), itr); - } else { - try { - int start_pos = io_name.find(deliminator); - if (start_pos == -1) { - throw std::invalid_argument("input must follow naming convention"); - } - ip_index = std::atoi(io_name.substr(start_pos + 2).c_str()); - input_index_map_[io_name] = ip_index; - } - catch (std::exception& ex) { - return TRITONSERVER_ErrorNew( - TRITONSERVER_ERROR_INTERNAL, - ("input '" + io_name + - "' is neither an input argument to the model nor does it " - "follow the naming convention i.e. 
__.") - .c_str()); - } - } - } - - // Validate data type - std::string io_dtype; - RETURN_IF_ERROR(io.MemberAsString("data_type", &io_dtype)); - const auto pr = ModelConfigDataTypeToTorchType(io_dtype); - if (!pr.first) { - return TRITONSERVER_ErrorNew( - TRITONSERVER_ERROR_INTERNAL, - ("unsupported datatype " + io_dtype + " for input '" + io_name + - "' for model '" + model_state_->Name() + "'") - .c_str()); - } - } - - return nullptr; // success -} - -TRITONSERVER_Error* -ModelInstanceState::ValidateOutputs() -{ - triton::common::TritonJson::Value ios; - RETURN_IF_ERROR(model_state_->ModelConfig().MemberAsArray("output", &ios)); - std::string deliminator = "__"; - int op_index = 0; - - if (ios.ArraySize() == 0) { - return TRITONSERVER_ErrorNew( - TRITONSERVER_ERROR_INTERNAL, - "model configuration must contain at least one output, none were " - "specified."); - } - - for (size_t i = 0; i < ios.ArraySize(); i++) { - triton::common::TritonJson::Value io; - RETURN_IF_ERROR(ios.IndexAsObject(i, &io)); - - // Validate name - std::string io_name; - RETURN_IF_ERROR(io.MemberAsString("name", &io_name)); - try { - int start_pos = io_name.find(deliminator); - if (start_pos == -1) { - throw std::invalid_argument("output must follow naming convention"); - } - op_index = std::atoi(io_name.substr(start_pos + 2).c_str()); - } - catch (std::exception& ex) { - return TRITONSERVER_ErrorNew( - TRITONSERVER_ERROR_INTERNAL, - ("output '" + io_name + - "' does not follow naming convention i.e. __.") - .c_str()); - } - - // Validate data type - std::string io_dtype; - RETURN_IF_ERROR(io.MemberAsString("data_type", &io_dtype)); - const auto pr = ModelConfigDataTypeToTorchType(io_dtype); - if (!pr.first) { - return TRITONSERVER_ErrorNew( - TRITONSERVER_ERROR_INTERNAL, - ("unsupported datatype " + io_dtype + " for output '" + io_name + - "' for model '" + model_state_->Name() + "'") - .c_str()); - } - output_index_map_[io_name] = op_index; - output_dtype_map_[io_name] = ConvertTorchTypeToDataType(pr.second); - } - - return nullptr; // success -} - -void -ModelInstanceState::ProcessRequests( - TRITONBACKEND_Request** requests, const uint32_t request_count) -{ - LOG_MESSAGE( - TRITONSERVER_LOG_VERBOSE, - (std::string("TRITONBACKEND_ModelExecute: Running ") + Name() + " with " + - std::to_string(request_count) + " requests") - .c_str()); - - uint64_t exec_start_ns = 0; - SET_TIMESTAMP(exec_start_ns); - - const int max_batch_size = model_state_->MaxBatchSize(); - - // For each request collect the total batch size for this inference - // execution. The batch-size, number of inputs, and size of each - // input has already been checked so don't need to do that here. - size_t total_batch_size = 0; - for (size_t i = 0; i < request_count; i++) { - // If we get a nullptr request then something is badly wrong. Fail - // and release all requests. - if (requests[i] == nullptr) { - RequestsRespondWithError( - requests, request_count, - TRITONSERVER_ErrorNew( - TRITONSERVER_ERROR_INTERNAL, - std::string( - "null request given to PyTorch backend for '" + Name() + "'") - .c_str())); - return; - } - } - - // At this point we are committed to running inference with all - // 'requests'. Create a response for each request. During input - // processing if there is an error with any request that error will - // be sent immediately with the corresponding response (and the - // response unique_ptr will then be nullptr). 
The request object - // itself will not be released until after all inferencing is done - // (below) as we may need to access the request object when - // determine how to process outputs (for example, even if we don't - // need the outputs for a request that has an error, we do need to - // know the size of those outputs associated with the request so we - // can skip them in the output tensors). - std::vector responses; - responses.reserve(request_count); - bool all_response_failed = false; - - for (size_t i = 0; i < request_count; i++) { - TRITONBACKEND_Response* response; - auto err = TRITONBACKEND_ResponseNew(&response, requests[i]); - if (err == nullptr) { - responses.emplace_back(response); - } else { - responses.emplace_back(nullptr); - LOG_MESSAGE(TRITONSERVER_LOG_ERROR, "Fail to create response"); - TRITONSERVER_ErrorDelete(err); - } - } - - - for (size_t i = 0; i < request_count; i++) { - if (max_batch_size > 0) { - // Retrieve the batch size from one of the inputs, if the model - // supports batching, the first dimension size is batch size - TRITONBACKEND_Input* input; - TRITONSERVER_Error* err = - TRITONBACKEND_RequestInputByIndex(requests[i], 0 /* index */, &input); - if (err == nullptr) { - const int64_t* shape; - err = TRITONBACKEND_InputProperties( - input, nullptr, nullptr, &shape, nullptr, nullptr, nullptr); - total_batch_size += shape[0]; - } - if (err != nullptr) { - RESPOND_ALL_AND_SET_TRUE_IF_ERROR( - responses, request_count, all_response_failed, err); - } - } else { - total_batch_size += 1; - } - } - - // If there are no valid payloads then no need to run the inference. - if (total_batch_size == 0) { - return; - } - - // Make sure the maximum batch size is not exceeded. The - // total_batch_size must be 1 for models that don't support batching - // (i.e. max_batch_size == 0). If max_batch_size is exceeded then - // scheduler has done something badly wrong so fail and release all - // requests. - if (!all_response_failed) { - if ((total_batch_size != 1) && - (total_batch_size > (size_t)max_batch_size)) { - RESPOND_ALL_AND_SET_TRUE_IF_ERROR( - responses, request_count, all_response_failed, - TRITONSERVER_ErrorNew( - TRITONSERVER_ERROR_INTERNAL, - std::string( - "batch size " + std::to_string(total_batch_size) + " for '" + - Name() + "', max allowed is " + - std::to_string(max_batch_size)) - .c_str())); - } - } - - std::vector input_names; - std::vector input_tensors; - std::vector input_memories; - bool cuda_copy = false; - std::unique_ptr collector; - - if (!all_response_failed) { - collector.reset(new BackendInputCollector( - requests, request_count, &responses, - model_state_->TritonMemoryManager(), model_state_->EnablePinnedInput(), - CudaStream(), nullptr, nullptr, 0, HostPolicyName().c_str())); - RESPOND_ALL_AND_SET_TRUE_IF_ERROR( - responses, request_count, all_response_failed, - SetInputTensors( - total_batch_size, requests, request_count, &responses, - collector.get(), &input_names, &input_tensors, &input_memories, - &cuda_copy)); - } - - // Request to retrieve all model outputs. 'output_names' and - // 'output_tensors' are parallel vectors and so must be kept in - // sync. 
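// [Illustrative note; not part of the patch] A worked example of the batch
// accounting earlier in ProcessRequests(): with max_batch_size >= 8 and three
// requests whose first input has shape [2,3], [1,3] and [5,3],
// total_batch_size = 2 + 1 + 5 = 8 and the collected input tensor passed to
// the model has shape [8,3]. For a non-batching model (max_batch_size == 0)
// each request contributes exactly 1.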
- std::vector output_names; - std::vector output_tensors; - if (!all_response_failed) { - triton::common::TritonJson::Value ios; - TRITONSERVER_Error* err = - model_state_->ModelConfig().MemberAsArray("output", &ios); - if (err == nullptr) { - for (size_t i = 0; i < ios.ArraySize(); i++) { - triton::common::TritonJson::Value io; - err = ios.IndexAsObject(i, &io); - if (err != nullptr) { - break; - } - - // Use names from ModelConfig by reference since the model - // config will persist longer than this inference execution. - const char* io_name; - size_t io_name_len; - err = io.MemberAsString("name", &io_name, &io_name_len); - if (err != nullptr) { - break; - } - - output_names.emplace_back(io_name); - } - } - - if (err != nullptr) { - RESPOND_ALL_AND_SET_TRUE_IF_ERROR( - responses, request_count, all_response_failed, err); - output_names.clear(); - } - } - -// Wait for any in-flight input tensor copies to complete. -#ifdef TRITON_ENABLE_GPU - if (cuda_copy) { - cudaStreamSynchronize(CudaStream()); - } -#endif - - uint64_t compute_start_ns = 0; - SET_TIMESTAMP(compute_start_ns); - - // Run... - if (!all_response_failed) { - Execute(&responses, request_count, &input_tensors, &output_tensors); - } - - // Free BackendMemory used for inputs - for (BackendMemory* mem : input_memories) { - if (mem != nullptr) { - delete mem; - } - } - input_memories.clear(); - - // Verify output indices are valid with number of outputs after execution - bool invalid_index = false; - int max_index = output_tensors.size() - 1; - - if (!all_response_failed) { - for (const auto& name : output_names) { - int op_index = output_index_map_[name]; - if ((op_index < 0) || (op_index > max_index)) { - RESPOND_ALL_AND_SET_TRUE_IF_ERROR( - responses, request_count, all_response_failed, - TRITONSERVER_ErrorNew( - TRITONSERVER_ERROR_INVALID_ARG, - std::string( - "The output " + std::string(name) + - " in the model configuration refers to an output index " - "which" - " doesn't exist. This model has " + - std::to_string(max_index + 1) + " outputs") - .c_str())); - invalid_index = true; - break; - } - } - } - - uint64_t compute_end_ns = 0; - - if (!all_response_failed) { - if (!invalid_index) { - RESPOND_ALL_AND_SET_TRUE_IF_ERROR( - responses, request_count, all_response_failed, - ReadOutputTensors( - total_batch_size, output_names, output_tensors, requests, - request_count, &responses, &compute_end_ns)); - } - } - - uint64_t exec_end_ns = 0; - SET_TIMESTAMP(exec_end_ns); - - // Send all the responses that haven't already been sent because of - // an earlier error. Note that the responses are not set to nullptr - // here as we need that indication below to determine if the request - // we successful or not. - for (auto& response : responses) { - if (response != nullptr) { - LOG_IF_ERROR( - TRITONBACKEND_ResponseSend( - response, TRITONSERVER_RESPONSE_COMPLETE_FINAL, nullptr), - "failed to send PyTorch backend response"); - } - } - - // Report statistics for each request. - for (uint32_t r = 0; r < request_count; ++r) { - auto& request = requests[r]; - LOG_IF_ERROR( - TRITONBACKEND_ModelInstanceReportStatistics( - TritonModelInstance(), request, - (responses[r] != nullptr) /* success */, exec_start_ns, - compute_start_ns, compute_end_ns, exec_end_ns), - "failed reporting request statistics"); - - LOG_IF_ERROR( - TRITONBACKEND_RequestRelease(request, TRITONSERVER_REQUEST_RELEASE_ALL), - "failed releasing request"); - } - - if (!all_response_failed) { - // Report the entire batch statistics. 
- LOG_IF_ERROR( - TRITONBACKEND_ModelInstanceReportBatchStatistics( - TritonModelInstance(), total_batch_size, exec_start_ns, - compute_start_ns, compute_end_ns, exec_end_ns), - "failed reporting batch request statistics"); - } -} - -void -ModelInstanceState::Execute( - std::vector* responses, - const uint32_t response_count, - std::vector* input_tensors, - std::vector* output_tensors) -{ - torch::jit::IValue model_outputs_; - - try { - // enable/disable optimized execution - torch::jit::setGraphExecutorOptimize( - model_state_->EnabledOptimizedExecution()); - - // enable/disable inference mode - supersedes NoGradGuard - torch::InferenceMode infer_guard(model_state_->EnabledInferenceMode()); - - // JIT. No change is made unless parameter is explicitly set. - if (std::get<0>(model_state_->EnabledJitProfiling())) { - torch::jit::getProfilingMode() = - std::get<1>(model_state_->EnabledJitProfiling()); - } - - if (std::get<0>(model_state_->EnabledJitExecutor())) { - torch::jit::getExecutorMode() = - std::get<1>(model_state_->EnabledJitExecutor()); - } - - // Fuser. Parameter is ignored if NVFuser parameter is explicitily - // set (either enabled or disabled). No change is made unless - // fuser is explicitly set in parameters. - if (!std::get<0>(model_state_->EnabledNvfuserPair()) && - std::get<0>(model_state_->EnabledTensorExprFuser())) { - torch::jit::setTensorExprFuserEnabled( - std::get<1>(model_state_->EnabledTensorExprFuser())); - } - - // NV-Fuser. No change is made unless parameter is explicitly set. - if (std::get<0>(model_state_->EnabledNvfuserPair())) { - if (std::get<1>(model_state_->EnabledNvfuserPair()) && - (device_.type() != torch::kCPU)) { - torch::jit::overrideCanFuseOnCPU(false); - torch::jit::overrideCanFuseOnGPU(false); - torch::jit::setTensorExprFuserEnabled(false); - torch::jit::RegisterCudaFuseGraph::registerPass(true); - } else { - torch::jit::overrideCanFuseOnCPU(true); - torch::jit::overrideCanFuseOnGPU(true); - torch::jit::setTensorExprFuserEnabled(true); - torch::jit::RegisterCudaFuseGraph::registerPass(false); - } - } - - torch::NoGradGuard no_grad; - - // If input is a dictionary, prepare dictionary from 'input_tensors'. 
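// [Illustrative note; not part of the patch] A model taking the dictionary
// path below declares its forward() with a single Dict[str, Tensor] argument,
// keyed by the input names from the model configuration; for example, in the
// Python source that was scripted:
//
//   from typing import Dict
//   import torch
//
//   class DictModel(torch.nn.Module):
//       def forward(self, inputs: Dict[str, torch.Tensor]) -> torch.Tensor:
//           return inputs["input_ids"] + inputs["token_type_ids"]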
- if (is_dict_input_) { - torch::Dict input_dict; - for (auto& input_index : input_index_map_) { - torch::jit::IValue ival = (*input_tensors)[input_index.second]; - input_dict.insert(input_index.first, ival.toTensor()); - } - std::vector input_dict_ivalue = {input_dict}; - model_outputs_ = torch_model_->forward(input_dict_ivalue); - } else { - model_outputs_ = torch_model_->forward(*input_tensors); - } - - if (model_outputs_.isTuple()) { - auto model_outputs_tuple = model_outputs_.toTuple(); - for (auto& m_op : model_outputs_tuple->elements()) { - output_tensors->push_back(m_op.toTensor()); - } - } else { - try { - auto model_output_tensor = model_outputs_.toTensor(); - output_tensors->push_back(model_output_tensor); - } - catch (std::exception& exx) { - throw std::invalid_argument( - "Output of torch model should be tensor or a tuple of tensors, not " - "a list / dictionary of tensors or a scalar: " + - std::string(exx.what())); - } - } - } - catch (std::exception& ex) { - SendErrorForResponses( - responses, response_count, - TRITONSERVER_ErrorNew( - TRITONSERVER_ERROR_INTERNAL, - ("PyTorch execute failure: " + std::string(ex.what())).c_str())); - } -} - -TRITONSERVER_Error* -ModelInstanceState::SetInputTensors( - size_t total_batch_size, TRITONBACKEND_Request** requests, - const uint32_t request_count, - std::vector* responses, - BackendInputCollector* collector, std::vector* input_names, - std::vector* input_tensors, - std::vector* input_memories, bool* cuda_copy) -{ - const int max_batch_size = model_state_->MaxBatchSize(); - - // InferenceMode should be used to guard all tensors operations - torch::InferenceMode infer_guard(model_state_->EnabledInferenceMode()); - - // All requests must have equally-sized input tensors so use any - // request as the representative for the input tensors. - uint32_t input_count; - RETURN_IF_ERROR(TRITONBACKEND_RequestInputCount(requests[0], &input_count)); - input_tensors->resize(input_count); - for (uint32_t input_idx = 0; input_idx < input_count; input_idx++) { - TRITONBACKEND_Input* input; - RETURN_IF_ERROR( - TRITONBACKEND_RequestInputByIndex(requests[0], input_idx, &input)); - - const char* input_name; - TRITONSERVER_DataType input_datatype; - const int64_t* input_shape; - uint32_t input_dims_count; - RETURN_IF_ERROR(TRITONBACKEND_InputProperties( - input, &input_name, &input_datatype, &input_shape, &input_dims_count, - nullptr, nullptr)); - - input_names->emplace_back(input_name); - - // The shape for the entire input patch, [total_batch_size, ...] - std::vector batchn_shape( - input_shape, input_shape + input_dims_count); - if (max_batch_size != 0) { - batchn_shape[0] = total_batch_size; - } - - // The input must be in contiguous CPU/GPU memory. - std::vector> alloc_perference; - if (device_.is_cpu()) { - alloc_perference = {{TRITONSERVER_MEMORY_CPU_PINNED, 0}, - {TRITONSERVER_MEMORY_CPU, 0}}; - } else { - alloc_perference = {{TRITONSERVER_MEMORY_GPU, device_.index()}}; - } - - const char* input_buffer; - size_t batchn_byte_size; - TRITONSERVER_MemoryType memory_type; - int64_t memory_type_id; - RETURN_IF_ERROR(collector->ProcessTensor( - input_name, nullptr, 0, alloc_perference, &input_buffer, - &batchn_byte_size, &memory_type, &memory_type_id)); - - // Create Torch tenor - const auto torch_dtype = ConvertDataTypeToTorchType(input_datatype); - torch::TensorOptions options{torch_dtype.second}; - auto updated_options = (memory_type == TRITONSERVER_MEMORY_GPU) - ? 
options.device(torch::kCUDA, device_.index()) - : options.device(torch::kCPU); - - // Remove constness to align with the signature of torch::from_blob() - torch::Tensor input_tensor = torch::from_blob( - const_cast(input_buffer), batchn_shape, updated_options); - (*input_tensors)[input_index_map_[input_name]] = input_tensor; - } - - // Finalize... - *cuda_copy |= collector->Finalize(); - - return nullptr; -} - -TRITONSERVER_Error* -ModelInstanceState::ReadOutputTensors( - size_t total_batch_size, const std::vector& output_names, - const std::vector& output_tensors, - TRITONBACKEND_Request** requests, const uint32_t request_count, - std::vector* responses, uint64_t* compute_end_ns) -{ - BackendOutputResponder responder( - requests, request_count, responses, model_state_->TritonMemoryManager(), - model_state_->MaxBatchSize() > 0, model_state_->EnablePinnedInput(), - CudaStream()); - - bool cuda_copy = false; - std::vector> string_buffers; - for (size_t idx = 0; idx < output_names.size(); idx++) { - std::string name = output_names[idx]; - int op_index = output_index_map_[name]; - torch::Tensor output_flat; - - try { - output_flat = output_tensors[op_index].contiguous().flatten(); - } - catch (std::exception& ex) { - RETURN_IF_ERROR(TRITONSERVER_ErrorNew( - TRITONSERVER_ERROR_INTERNAL, - (std::string("output tensor '") + name + "' is not found").c_str())); - } - - // Verify output datatype matches datatype from model config - TRITONSERVER_DataType output_dtype = - ConvertTorchTypeToDataType(output_flat.scalar_type()); - TRITONSERVER_DataType config_datatype = output_dtype_map_[name]; - if (config_datatype != output_dtype) { - RETURN_IF_ERROR(TRITONSERVER_ErrorNew( - TRITONSERVER_ERROR_INVALID_ARG, - (std::string("configuration expects datatype TYPE_") + - TRITONSERVER_DataTypeString(config_datatype) + " for output '" + - name + "', model provides TYPE_" + - TRITONSERVER_DataTypeString(output_dtype)) - .c_str())); - } - - const char* output_buffer = - static_cast(output_flat.data_ptr()); - - // Output tensors may not reside on the same device as model - torch::Device tensor_device = output_flat.device(); - - // Set output shape - std::vector batchn_shape; - auto shape = output_tensors[op_index].sizes(); - for (auto itr = shape.begin(); itr != shape.end(); itr++) { - batchn_shape.push_back(*itr); - } - - if (batchn_shape.size() == 0) { - RETURN_IF_ERROR(TRITONSERVER_ErrorNew( - TRITONSERVER_ERROR_INVALID_ARG, - (std::string("output '") + name + - "' is a scalar which is not supported.") - .c_str())); - } - - responder.ProcessTensor( - name, output_dtype, batchn_shape, output_buffer, - (tensor_device.type() == torch::kCPU) ? TRITONSERVER_MEMORY_CPU - : TRITONSERVER_MEMORY_GPU, - (tensor_device.type() == torch::kCPU) ? 0 : tensor_device.index()); - - // PyTorch uses asynchronous execution to run the model. Setting the compute - // end timestamp immediately after Execute() does not capture the complete - // model execution time. When the first output buffer is accessed/copied by - // ProcessTensor(), there is a synchronization that is done to ensure the - // data is correctly copied from the output tensor. To avoid overheads of - // additional synchronization, we continue to use the default cuda stream. - // However the drawback of this is that the compute infer time reported - // would be slightly later than it is in reality and the compute output time - // reported would be smaller than it is in reality. We allow this because - // synchronizing manually negatively impacts performance. 
- if (idx == 0) { - SET_TIMESTAMP(*compute_end_ns); - } - } - - // Finalize and wait for any pending buffer copies. - cuda_copy |= responder.Finalize(); - -#ifdef TRITON_ENABLE_GPU - if (cuda_copy) { - cudaStreamSynchronize(stream_); - } -#endif // TRITON_ENABLE_GPU - - return nullptr; -} - -///////////// +namespace triton::backend::pytorch { extern "C" { @@ -1551,9 +214,13 @@ TRITONBACKEND_ModelInstanceExecute( // specific request. instance_state->ProcessRequests(requests, request_count); + if (model_state->EnabledCacheCleaning()) { + instance_state->ClearCache(); + } + return nullptr; // success -} +}; } // extern "C" -}}} // namespace triton::backend::pytorch +} // namespace triton::backend::pytorch diff --git a/src/libtorch.hh b/src/libtorch.hh new file mode 100644 index 0000000..4bd4700 --- /dev/null +++ b/src/libtorch.hh @@ -0,0 +1,59 @@ +// Copyright 2019-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions +// are met: +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// * Neither the name of NVIDIA CORPORATION nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +#include "model_instance_state.hh" +#include "model_state.hh" +#include "naming_convention.hh" +#include "string_utils.hh" + +// +// PyTorch C++ (LibTorch) Backend that implements the TRITONBACKEND API. 
+// + +namespace triton::backend::pytorch { + +extern "C" { + +TRITONSERVER_Error* TRITONBACKEND_Initialize(TRITONBACKEND_Backend* backend); + +TRITONSERVER_Error* TRITONBACKEND_ModelInitialize(TRITONBACKEND_Model* model); + +TRITONSERVER_Error* TRITONBACKEND_ModelFinalize(TRITONBACKEND_Model* model); + +TRITONSERVER_Error* TRITONBACKEND_ModelInstanceInitialize( + TRITONBACKEND_ModelInstance* instance); + +TRITONSERVER_Error* TRITONBACKEND_ModelInstanceFinalize( + TRITONBACKEND_ModelInstance* instance); + +TRITONSERVER_Error* TRITONBACKEND_ModelInstanceExecute( + TRITONBACKEND_ModelInstance* instance, TRITONBACKEND_Request** requests, + const uint32_t request_count); + +} // extern "C" + + +} // namespace triton::backend::pytorch diff --git a/src/libtorch_utils.cc b/src/libtorch_utils.cc index a554ba9..bd7353b 100644 --- a/src/libtorch_utils.cc +++ b/src/libtorch_utils.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2020-21 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// Copyright (c) 2020-24 NVIDIA CORPORATION & AFFILIATES. All rights reserved. // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions @@ -149,4 +149,31 @@ ParseParameter( return nullptr; } +TRITONSERVER_Error* +ParseParameter( + triton::common::TritonJson::Value& params, const std::string& mkey, + int* value) +{ + std::string value_str; + RETURN_IF_ERROR(GetParameterValue(params, mkey, &value_str)); + RETURN_IF_ERROR(ParseIntValue(value_str, value)); + + return nullptr; +} + + +#ifdef TRITON_ENABLE_GPU +TRITONSERVER_Error* +ConvertCUDAStatusToTritonError( + cudaError_t cuda_error, TRITONSERVER_Error_Code code, const char* msg) +{ + if (cuda_error != cudaSuccess) { + return TRITONSERVER_ErrorNew( + code, + (std::string(msg) + ": " + cudaGetErrorString(cuda_error)).c_str()); + } + return nullptr; // success +} +#endif + }}} // namespace triton::backend::pytorch diff --git a/src/libtorch_utils.h b/src/libtorch_utils.h index e112037..6ec325b 100644 --- a/src/libtorch_utils.h +++ b/src/libtorch_utils.h @@ -1,4 +1,4 @@ -// Copyright 2020-2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// Copyright 2020-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions @@ -35,7 +35,6 @@ #pragma warning(push, 0) #include #include -#include #include #include #include // One-stop header for TorchScript @@ -51,11 +50,23 @@ std::pair ConvertDataTypeToTorchType( std::pair ModelConfigDataTypeToTorchType( const std::string& data_type_str); -// If the key 'mkey' is present in 'params' then update 'value' with the value -// associated with that key. If 'mkey' is not present in 'params' then no update -// is made to 'value'. +#ifdef TRITON_ENABLE_GPU +TRITONSERVER_Error* ConvertCUDAStatusToTritonError( + cudaError_t cuda_error, TRITONSERVER_Error_Code code, const char* msg); +#endif + +// If the key 'mkey' is present in 'params' then update 'value' with the +// value associated with that key. If 'mkey' is not present in 'params' then +// no update is made to 'value'. TRITONSERVER_Error* ParseParameter( triton::common::TritonJson::Value& params, const std::string& mkey, bool* value); +// If the key 'mkey' is present in 'params' then update 'value' with the +// value associated with that key. If 'mkey' is not present in 'params' then +// 'value' is set to 'default_value'. 
+TRITONSERVER_Error* ParseParameter( + triton::common::TritonJson::Value& params, const std::string& mkey, + int* value); + }}} // namespace triton::backend::pytorch diff --git a/src/model.py b/src/model.py new file mode 100755 index 0000000..d8ed413 --- /dev/null +++ b/src/model.py @@ -0,0 +1,315 @@ +#!/usr/bin/env python3 + +# Copyright 2023-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
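# [Illustrative note; not part of the patch] Layout accepted by this
# Python-based runtime: either a TorchScript archive, or a Python file
# defining a torch.nn.Module subclass plus an optional state-dict file with
# the same stem, e.g.
#
#   my_model/
#     config.pbtxt
#     1/
#       model.py        # torch.nn.Module subclass, run under torch.compile
#       model.pt        # optional state_dict loaded via load_state_dict()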
+ +import importlib +import json +import os + +try: + import torch +except ModuleNotFoundError as error: + raise RuntimeError("Missing/Incomplete PyTorch package installation") from error + +import triton_python_backend_utils as pb_utils + + +def _get_model_path(config): + # FIXME: Add support for torch.export IR models (.pt2) + filenames = ["model.py", "model.pt"] + if config["default_model_filename"]: + filenames.insert(0, config["default_model_filename"]) + for filename in filenames: + model_path = os.path.join(pb_utils.get_model_dir(), filename) + if os.path.exists(model_path): + return model_path + raise pb_utils.TritonModelException( + "No model found in " + pb_utils.get_model_dir() + "/" + str(filenames) + ) + + +def _get_model_data_path(model_path): + data_path_extensions = [".pt"] + model_path_no_extension = model_path[: -(len(model_path.split(".")[-1]) + 1)] + for extension in data_path_extensions: + data_path = model_path_no_extension + extension + if os.path.exists(data_path): + return data_path + # data file not provided + return "" + + +def _is_py_class_model(model_path): + return model_path[-3:] == ".py" + + +def _import_module_from_path(module_name, file_path): + spec = importlib.util.spec_from_file_location(module_name, file_path) + module = importlib.util.module_from_spec(spec) + spec.loader.exec_module(module) + return module + + +def _get_model_class_from_module(module): + names = dir(module) + for name in names: + attr = getattr(module, name) + try: + if issubclass(attr, torch.nn.Module): + return attr + except TypeError: + # attr may not be a class + pass + raise pb_utils.TritonModelException("Cannot find a subclass of torch.nn.Module") + + +def _parse_io_config(io_config): + io = [] + for conf in io_config: + io.append({"name": conf["name"]}) + return io + + +def _get_device_name(kind, device_id): + if kind == "GPU": + return "cuda:" + device_id + if kind == "CPU": + return "cpu" + # unspecified device + return "" + + +def _get_device(kind, device_id, model): + device_name = _get_device_name(kind, device_id) + if device_name == "": + for param in model.parameters(): + return param.device + raise pb_utils.TritonModelException("Cannot determine model device") + return torch.device(device_name) + + +def _set_torch_parallelism(config): + log_msg = "" + parallelism_settings = ["NUM_THREADS", "NUM_INTEROP_THREADS"] + for setting in parallelism_settings: + val = "1" + if setting in config["parameters"]: + val = config["parameters"][setting]["string_value"] + getattr(torch, "set_" + setting.lower())(int(val)) + log_msg += setting + " = " + val + "; " + return log_msg + + +def _get_torch_compile_params(config): + params = {} + if "TORCH_COMPILE_OPTIONAL_PARAMETERS" in config["parameters"]: + val = config["parameters"]["TORCH_COMPILE_OPTIONAL_PARAMETERS"]["string_value"] + params = json.loads(val) + if "model" in params: + raise pb_utils.TritonModelException( + "'model' is not an optional parameter for 'torch.compile'" + ) + return params + + +def _gather_torch_tensors(scatter_tensors): + gather_tensors = [] + sections = [] + for i in range(len(scatter_tensors)): + tensors = scatter_tensors[i] + for j in range(len(tensors)): + tensor = tensors[j] + if j < len(gather_tensors): + # add to existing tensor + gather_tensors[j] = torch.cat((gather_tensors[j], tensor), 0) + else: + # start a new tensor + gather_tensors.append(tensor) + # record section + section_length = tensors[0].size()[0] + sections.append(section_length) + return gather_tensors, sections + + +def 
_scatter_torch_tensors(gather_tensors, sections): + scatter_tensors = [] + for j in range(len(gather_tensors)): + scatter_tensor = torch.split(gather_tensors[j], sections) + for i in range(len(scatter_tensor)): + tensor = scatter_tensor[i] + if i < len(scatter_tensors): + # add to existing response + scatter_tensors[i].append(tensor) + else: + # start a new response + scatter_tensors.append([tensor]) + return scatter_tensors + + +class TritonPythonModel: + """Your Python model must use the same class name. Every Python model + that is created must have "TritonPythonModel" as the class name. + """ + + def initialize(self, args): + """`initialize` is called only once when the model is being loaded. + Implementing `initialize` function is optional. This function allows + the model to initialize any state associated with this model. + Parameters + ---------- + args : dict + Both keys and values are strings. The dictionary keys and values are: + * model_config: A JSON string containing the model configuration + * model_instance_kind: A string containing model instance kind + * model_instance_device_id: A string containing model instance device ID + * model_repository: Model repository path + * model_version: Model version + * model_name: Model name + """ + self._model_name = args["model_name"] + for_model = "for '" + self._model_name + "'" + self._logger = pb_utils.Logger + self._logger.log_info("Initializing model instance " + for_model) + + self._model_config = json.loads(args["model_config"]) + self._kind = args["model_instance_kind"] + self._device_id = args["model_instance_device_id"] + self._support_batching = self._model_config["max_batch_size"] > 0 + self._inputs = _parse_io_config(self._model_config["input"]) + self._outputs = _parse_io_config(self._model_config["output"]) + + setting_msg = _set_torch_parallelism(self._model_config) + self._logger.log_verbose( + "Torch parallelism settings " + for_model + ": " + setting_msg + ) + + self._infer_mode = torch.inference_mode(mode=True) + self._infer_mode.__enter__() + + params = _get_torch_compile_params(self._model_config) + self._logger.log_verbose( + "'torch.compile' optional parameter(s) " + for_model + ": " + str(params) + ) + if self._support_batching: + self._gather = torch.compile(_gather_torch_tensors, **params) + self._scatter = torch.compile(_scatter_torch_tensors, **params) + + model_path = _get_model_path(self._model_config) + if not _is_py_class_model(model_path): + self._logger.log_info("Loading '" + self._model_name + "' as TorchScript") + self._model = torch.jit.load(model_path) + self._device = _get_device(self._kind, self._device_id, self._model) + self._model.to(self._device) + self._model.eval() + return + + self._model_module = _import_module_from_path(self._model_name, model_path) + self._model_class = _get_model_class_from_module(self._model_module) + self._raw_model = self._model_class() + self._device = _get_device(self._kind, self._device_id, self._raw_model) + data_path = _get_model_data_path(model_path) + if data_path != "": + self._raw_model.load_state_dict( + torch.load(data_path, map_location=self._device) + ) + else: + self._logger.log_info("Model parameter file not found " + for_model) + self._raw_model.to(self._device) + self._raw_model.eval() + self._model = torch.compile(self._raw_model, **params) + + def execute(self, requests): + """`execute` MUST be implemented in every Python model. `execute` + function receives a list of pb_utils.InferenceRequest as the only + argument. 
This function is called when an inference request is made + for this model. Depending on the batching configuration (e.g. Dynamic + Batching) used, `requests` may contain multiple requests. Every + Python model, must create one pb_utils.InferenceResponse for every + pb_utils.InferenceRequest in `requests`. If there is an error, you can + set the error argument when creating a pb_utils.InferenceResponse + Parameters + ---------- + requests : list + A list of pb_utils.InferenceRequest + Returns + ------- + list + A list of pb_utils.InferenceResponse. The length of this list must + be the same as `requests` + """ + + responses = [] + + requests_tensors = [] + for request in requests: + tensors = [] + for io in self._inputs: + tensor = pb_utils.get_input_tensor_by_name( + request, io["name"] + ).to_dlpack() + tensor = torch.from_dlpack(tensor).to(self._device) + tensors.append(tensor) + requests_tensors.append(tensors) + + sections = None + if self._support_batching: + requests_tensors, sections = self._gather(requests_tensors) + requests_tensors = [requests_tensors] + + responses_tensors = [] + for input_tensors in requests_tensors: + output_tensors = self._model(*input_tensors) + if not isinstance(output_tensors, tuple) and not isinstance( + output_tensors, list + ): + output_tensors = [output_tensors] + responses_tensors.append(output_tensors) + + if self._support_batching: + responses_tensors = self._scatter(responses_tensors[0], sections) + + for response_tensors in responses_tensors: + output_tensors = [] + for i in range(len(self._outputs)): + io = self._outputs[i] + tensor = response_tensors[i].detach() + tensor = pb_utils.Tensor.from_dlpack(io["name"], tensor) + output_tensors.append(tensor) + inference_response = pb_utils.InferenceResponse( + output_tensors=output_tensors + ) + responses.append(inference_response) + + return responses + + def finalize(self): + """`finalize` is called only once when the model is being unloaded. + Implementing `finalize` function is OPTIONAL. This function allows + the model to perform any necessary clean ups before exit. + """ + self._logger.log_info("Removing model instance for '" + self._model_name + "'") + self._infer_mode.__exit__(exc_type=None, exc_value=None, traceback=None) diff --git a/src/model_instance_state.cc b/src/model_instance_state.cc new file mode 100644 index 0000000..19cae27 --- /dev/null +++ b/src/model_instance_state.cc @@ -0,0 +1,1633 @@ +// Copyright 2019-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions +// are met: +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// * Neither the name of NVIDIA CORPORATION nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +#include "model_instance_state.hh" + +#include "string_utils.hh" + +#ifdef TRITON_PYTORCH_ENABLE_TORCHVISION +// Suppress warnings in torch headers +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wsign-compare" +#pragma warning(push, 0) +#include +#include // Torchvision header +#pragma warning(pop) +#pragma GCC diagnostic pop +#endif // TRITON_PYTORCH_ENABLE_TORCHVISION + +#ifdef TRITON_ENABLE_GPU +#include +#include +#include +#endif // TRITON_ENABLE_GPU + + +namespace triton::backend::pytorch { + +ModelInstanceState::ModelInstanceState( + ModelState* model_state, TRITONBACKEND_ModelInstance* triton_model_instance) + : BackendModelInstance(model_state, triton_model_instance), + model_state_(model_state), device_(torch::kCPU), is_dict_input_(false), + device_cnt_(0) +{ + if (Kind() == TRITONSERVER_INSTANCEGROUPKIND_GPU) { +#ifdef TRITON_ENABLE_GPU + device_ = torch::Device(torch::kCUDA, DeviceId()); + CreateCudaEvents(DeviceId()); +#endif + } + +#ifdef TRITON_ENABLE_GPU + device_cnt_ = torch::cuda::device_count(); +#endif + + THROW_IF_BACKEND_INSTANCE_ERROR(model_state->LoadModel( + ArtifactFilename(), device_, &model_path_, Kind(), &torch_model_)); + + if (Kind() == TRITONSERVER_INSTANCEGROUPKIND_MODEL) { +#ifdef TRITON_ENABLE_GPU + // Since we cannot determine the exact devices used by the model, we create + // a CUDA stream for every available device to ensure proper synchronization + // of CUDA streams. This approach may have implications when a timestamp is + // captured on a device that is not used by the model. Currently, this issue + // is addressed by synchronizing the CUDA streams before recording + // timestamps to prevent timestamp skewing. However, in the future, any + // modifications to the CUDA stream synchronization logic should be handled + // with caution. + for (int i = 0; i < device_cnt_; i++) { + cudaStream_t stream; + THROW_IF_BACKEND_INSTANCE_ERROR( + CreateCudaStream(i, 0 /* cuda_stream_priority */, &stream)); + stream_vec_.push_back(stream); + } + if (!stream_vec_.empty()) { + // Create CUDA events on the first device that will be used for collecting + // inputs/outputs. + CreateCudaEvents(0); + } +#endif + } + + size_t expected_input_cnt = 0; + { + triton::common::TritonJson::Value inputs; + if (model_state->ModelConfig().Find("input", &inputs)) { + expected_input_cnt = inputs.ArraySize(); + } + + triton::common::TritonJson::Value config_batch_inputs; + if (model_state->ModelConfig().Find("batch_input", &config_batch_inputs)) { + batch_input_count_ = config_batch_inputs.ArraySize(); + expected_input_cnt += batch_input_count_; + } + } + + // If this is a sequence model then make sure that the required + // inputs are present in the model and have the correct shape and + // datatype. 
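// [Illustrative note; not part of the patch] A worked example of the input
// counting above and in the sequence_batching block that follows: a
// configuration with 2 regular inputs, 1 batch_input, START and READY
// controls, and 1 state yields expected_input_cnt = 2 + 1 + 2 + 1 = 6, which
// ValidateInputs() then checks against the model's forward() signature.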
+ triton::common::TritonJson::Value sequence_batching; + if (model_state->ModelConfig().Find( + "sequence_batching", &sequence_batching)) { + bool have_start, have_end, have_ready, have_corrid; + THROW_IF_BACKEND_INSTANCE_ERROR(ValidateBooleanSequenceControl( + sequence_batching, "CONTROL_SEQUENCE_START", false /* required */, + &have_start)); + THROW_IF_BACKEND_INSTANCE_ERROR(ValidateBooleanSequenceControl( + sequence_batching, "CONTROL_SEQUENCE_END", false /* required */, + &have_end)); + THROW_IF_BACKEND_INSTANCE_ERROR(ValidateBooleanSequenceControl( + sequence_batching, "CONTROL_SEQUENCE_READY", false /* required */, + &have_ready)); + THROW_IF_BACKEND_INSTANCE_ERROR(ValidateTypedSequenceControl( + sequence_batching, "CONTROL_SEQUENCE_CORRID", false /* required */, + &have_corrid)); + if (have_start) { + expected_input_cnt += 1; + } + if (have_end) { + expected_input_cnt += 1; + } + if (have_ready) { + expected_input_cnt += 1; + } + if (have_corrid) { + expected_input_cnt += 1; + } + // Add the state inputs to the expected count + triton::common::TritonJson::Value states; + if (sequence_batching.Find("state", &states)) { + expected_input_cnt += states.ArraySize(); + } + } + supports_batching_ = model_state_->MaxBatchSize() > 0; + + THROW_IF_BACKEND_INSTANCE_ERROR(ValidateInputs(expected_input_cnt)); + THROW_IF_BACKEND_INSTANCE_ERROR(ValidateOutputs()); +} + +ModelInstanceState::~ModelInstanceState() +{ + torch_model_.reset(); + ClearCache(); + + if (Kind() == TRITONSERVER_INSTANCEGROUPKIND_MODEL) { +#ifdef TRITON_ENABLE_GPU + for (size_t i = 0; i < stream_vec_.size(); i++) { + LOG_IF_ERROR( + ConvertCUDAStatusToTritonError( + cudaSetDevice(i), TRITONSERVER_ERROR_INTERNAL, + "Failed to set the device"), + "Failed to set the device"); + + LOG_IF_ERROR( + ConvertCUDAStatusToTritonError( + cudaStreamDestroy(stream_vec_[i]), TRITONSERVER_ERROR_INTERNAL, + "Failed to destroy cuda stream"), + "~ModelInstanceState error: "); + stream_vec_[i] = nullptr; + } +#endif + } +} + +void +ModelInstanceState::AddInputToMap( + NamingConvention naming_convention, + const std::vector allowed_inputs, const std::string& io_name, + const uint32_t index) +{ + std::string deliminator = "__"; + + if (is_dict_input_) { + // If dictionary, index is irrelevant but we use the map to store the + // input names since they are the keys for the dictionary + input_index_map_[io_name] = index; + } else { + switch (naming_convention) { + case NamingConvention::FORWARD_ARGUMENT: { + auto itr = + std::find(allowed_inputs.begin(), allowed_inputs.end(), io_name); + if (itr != allowed_inputs.end()) { + input_index_map_[io_name] = + std::distance(allowed_inputs.begin(), itr); + } + return; + } + case NamingConvention::NAMED_INDEX: { + int start_pos = io_name.find(deliminator); + int ip_index = std::atoi(io_name.substr(start_pos + 2).c_str()); + input_index_map_[io_name] = ip_index; + return; + } + case NamingConvention::STRICT_CONFIG_ORDERING: { + input_index_map_[io_name] = index; + return; + } + } + } +} + +void +ModelInstanceState::ClearCache() +{ +#ifdef TRITON_ENABLE_GPU + if (device_.is_cuda() || + ((Kind() == TRITONSERVER_INSTANCEGROUPKIND_MODEL) && (device_cnt_ > 0))) { + c10::cuda::CUDACachingAllocator::emptyCache(); + } +#endif // TRITON_ENABLE_GPU +} + +TRITONSERVER_Error* +ModelInstanceState::Create( + ModelState* model_state, TRITONBACKEND_ModelInstance* triton_model_instance, + ModelInstanceState** state) +{ + try { + *state = new ModelInstanceState(model_state, triton_model_instance); + } + catch (const 
BackendModelInstanceException& ex) { + RETURN_ERROR_IF_TRUE( + ex.err_ == nullptr, TRITONSERVER_ERROR_INTERNAL, + std::string("unexpected nullptr in BackendModelInstanceException")); + RETURN_IF_ERROR(ex.err_); + } + + return nullptr; // success +} + +void +ModelInstanceState::Execute( + std::vector* responses, + const uint32_t response_count, + std::vector* input_tensors, + std::vector* output_tensors) +{ + NVTX_RANGE(nvtx_, "Execute " + Name()); + + torch::jit::IValue model_outputs_; + + try { + // enable/disable optimized execution + torch::jit::setGraphExecutorOptimize( + model_state_->EnabledOptimizedExecution()); + + // enable/disable inference mode - supersedes NoGradGuard + torch::InferenceMode infer_guard(model_state_->EnabledInferenceMode()); + + // enable/disable cudnn + at::globalContext().setUserEnabledCuDNN(model_state_->EnabledCudnn()); + + // JIT. No change is made unless parameter is explicitly set. + if (std::get<0>(model_state_->EnabledJitProfiling())) { + torch::jit::getProfilingMode() = + std::get<1>(model_state_->EnabledJitProfiling()); + } + + if (std::get<0>(model_state_->EnabledJitExecutor())) { + torch::jit::getExecutorMode() = + std::get<1>(model_state_->EnabledJitExecutor()); + } + + // Fuser. No change is made unless fuser is explicitly set in + // parameters. + if (std::get<0>(model_state_->EnabledTensorExprFuser())) { + torch::jit::setTensorExprFuserEnabled( + std::get<1>(model_state_->EnabledTensorExprFuser())); + } + + torch::NoGradGuard no_grad; + + // If input is a dictionary, prepare dictionary from 'input_tensors'. + if (is_dict_input_) { + torch::Dict input_dict; + for (auto& input_index : input_index_map_) { + torch::jit::IValue ival = (*input_tensors)[input_index.second]; + input_dict.insert(input_index.first, ival.toTensor()); + } + std::vector input_dict_ivalue = {input_dict}; + model_outputs_ = torch_model_->forward(input_dict_ivalue); + } else { + model_outputs_ = torch_model_->forward(*input_tensors); + } + + if (model_outputs_.isTuple()) { + auto model_outputs_tuple = model_outputs_.toTuple(); + size_t op_index = 0; + for (auto& m_op : model_outputs_tuple->elements()) { + if (m_op.isList()) { + auto list_output = m_op.toList(); + if (list_output.elementType()->kind() != c10::TypeKind::StringType) { + throw std::invalid_argument( + "output at index " + std::to_string(op_index) + + " must be of type Tensor or List[str], received List[" + + list_output.elementType()->str() + "]"); + } + output_tensors->push_back(m_op); + } else { + auto tensor_output = m_op.toTensor(); + output_tensors->push_back(m_op); + } + op_index++; + } + } else if (model_outputs_.isTensor()) { + output_tensors->push_back(model_outputs_); + } else if (model_outputs_.isList()) { + auto list_output = model_outputs_.toList(); + if (list_output.elementType()->kind() != c10::TypeKind::StringType) { + throw std::invalid_argument( + "output must be of type Tensor or List[str], received List[" + + list_output.elementType()->str() + "]"); + } + output_tensors->push_back(model_outputs_); + } else { + throw std::invalid_argument( + "output must be of type Tensor, List[str] or Tuple containing one of " + "these two types. 
It should not be a List / Dictionary of Tensors or " + "a Scalar"); + } + } + catch (std::exception& ex) { + SendErrorForResponses( + responses, response_count, + TRITONSERVER_ErrorNew( + TRITONSERVER_ERROR_INTERNAL, + ("PyTorch execute failure: " + std::string(ex.what())).c_str())); + } +} + +float +ModelInstanceState::GetCudaEventElapsedTime( + const cudaEvent_t& start_event, const cudaEvent_t& end_event) +{ + float duration = 0; +#ifdef TRITON_ENABLE_GPU + // [FIXME] in the case of cudaEventElapsedTime failure, should handle + // stats reporting more gracefully as the durations are inaccurate + LOG_IF_ERROR( + ConvertCUDAStatusToTritonError( + cudaEventElapsedTime(&duration, start_event, end_event), + TRITONSERVER_ERROR_INTERNAL, "Failed to capture elapsed time"), + "Failed to capture elapsed time"); +#endif + return duration; +} + + +cudaStream_t +ModelInstanceState::GetCudaStreamByInstanceKind() +{ +#ifdef TRITON_ENABLE_GPU + if (Kind() == TRITONSERVER_INSTANCEGROUPKIND_GPU) { + return stream_; + } else if ( + (Kind() == TRITONSERVER_INSTANCEGROUPKIND_MODEL) && + !stream_vec_.empty()) { + return stream_vec_[0]; + } +#endif + return nullptr; +} + +TRITONSERVER_Error* +ModelInstanceState::GetNamingConvention( + NamingConvention* naming_convention, + const std::vector& allowed_ios) +{ + // Rules for (non-Dictionary) input tensor names: + // 1. Must be in 'allowed_inputs' (arguments in the forward function) + // 2. Must follow the naming convention i.e. __ + // 3. If neither of the above conditions are satisfied, enforce strict + // ordering of model inputs. + // + // Rules for output tensor names: + // 1. Must follow the naming convention i.e. __ + // 2. If not, we enforce strict ordering of model outputs. + std::string deliminator = "__"; + std::string io_kind = "input"; + *naming_convention = NamingConvention::FORWARD_ARGUMENT; + + // symbolizes output + if (allowed_ios.size() == 0) { + io_kind = "output"; + *naming_convention = NamingConvention::NAMED_INDEX; + } + + triton::common::TritonJson::Value ios; + RETURN_IF_ERROR( + model_state_->ModelConfig().MemberAsArray(io_kind.c_str(), &ios)); + + if (io_kind == "input") { + for (size_t i = 0; i < ios.ArraySize(); i++) { + triton::common::TritonJson::Value io; + RETURN_IF_ERROR(ios.IndexAsObject(i, &io)); + + // Validate name + std::string io_name; + RETURN_IF_ERROR(io.MemberAsString("name", &io_name)); + auto itr = std::find(allowed_ios.begin(), allowed_ios.end(), io_name); + if (itr == allowed_ios.end()) { + *naming_convention = NamingConvention::NAMED_INDEX; + break; + } + } + } + + // If not, check if inputs follow INDEX + if (*naming_convention == NamingConvention::NAMED_INDEX) { + for (size_t i = 0; i < ios.ArraySize(); i++) { + triton::common::TritonJson::Value io; + RETURN_IF_ERROR(ios.IndexAsObject(i, &io)); + + // Validate name + std::string io_name; + RETURN_IF_ERROR(io.MemberAsString("name", &io_name)); + int start_pos = io_name.find(deliminator); + if (start_pos == -1) { + *naming_convention = NamingConvention::STRICT_CONFIG_ORDERING; + break; + } else { + // check if the index part of the name is not an integer + std::string index_str = io_name.substr(start_pos + 2); + bool is_int = true; + for (auto itr = index_str.begin(); itr != index_str.end(); itr++) { + if (std::isdigit(*itr) == 0) { + is_int = false; + } + } + + if (!is_int) { + if (io_kind == "input") { + LOG_MESSAGE( + TRITONSERVER_LOG_WARN, + ("input '" + io_name + + "' or previous input(s) are neither an input argument to the " + "model '" + + 
model_state_->Name() + + "' nor do they follow the __ naming convention. " + "Falling back to enforcing strict ordering from model " + "configuration.") + .c_str()); + } else { + LOG_MESSAGE( + TRITONSERVER_LOG_WARN, + ("output '" + io_name + + "' or previous output(s) of the model '" + + model_state_->Name() + + "' do not follow the __ naming convention. " + "Falling back to enforcing strict ordering from model " + "configuration.") + .c_str()); + } + *naming_convention = NamingConvention::STRICT_CONFIG_ORDERING; + break; + } + } + } + } + + triton::common::TritonJson::Value sequence_batching; + if (model_state_->ModelConfig().Find( + "sequence_batching", &sequence_batching)) { + // If we need to manage state for the model, then we need to check + // the naming of the state adheres to both the input and output conventions + triton::common::TritonJson::Value states; + if (sequence_batching.Find("state", &states)) { + if (*naming_convention != NamingConvention::NAMED_INDEX) { + return TRITONSERVER_ErrorNew( + TRITONSERVER_ERROR_INVALID_ARG, + ("PyTorch model '" + model_state_->Name() + + "' is using sequence batching with state but not all inputs and " + "outputs follow the __ naming convention. ") + .c_str()); + } + } + + for (size_t i = 0; i < states.ArraySize(); i++) { + triton::common::TritonJson::Value state; + RETURN_IF_ERROR(states.IndexAsObject(i, &state)); + std::string name_entry = + io_kind == "input" ? "input_name" : "output_name"; + std::string state_name; + RETURN_IF_ERROR(state.MemberAsString(name_entry.c_str(), &state_name)); + int start_pos = state_name.find(deliminator); + if (start_pos == -1) { + return TRITONSERVER_ErrorNew( + TRITONSERVER_ERROR_INVALID_ARG, + ("PyTorch model '" + model_state_->Name() + + "' is using sequence batching with state but state '" + + state_name + + "' does not follow the __ naming convention. ") + .c_str()); + } else { + // check if the index part of the name is not an integer + std::string index_str = state_name.substr(start_pos + 2); + bool is_int = true; + for (auto itr = index_str.begin(); itr != index_str.end(); itr++) { + if (std::isdigit(*itr) == 0) { + is_int = false; + } + } + if (!is_int) { + return TRITONSERVER_ErrorNew( + TRITONSERVER_ERROR_INVALID_ARG, + ("PyTorch model '" + model_state_->Name() + + "' is using sequence batching with state but state '" + + state_name + + "' does not follow the __ naming convention. ") + .c_str()); + } + } + } + } + + return nullptr; // success +} + +void +ModelInstanceState::ProcessRequests( + TRITONBACKEND_Request** requests, const uint32_t request_count) +{ + LOG_MESSAGE( + TRITONSERVER_LOG_VERBOSE, + (std::string("TRITONBACKEND_ModelExecute: Running ") + Name() + " with " + + std::to_string(request_count) + " requests") + .c_str()); + +#ifdef TRITON_ENABLE_GPU + if (Kind() == TRITONSERVER_INSTANCEGROUPKIND_GPU) { + SetCurrentCudaStream(stream_, DeviceId()); + } else if (Kind() == TRITONSERVER_INSTANCEGROUPKIND_MODEL) { + // Replace the default stream of each device with the one we created. + for (size_t i = 0; i < stream_vec_.size(); i++) { + SetCurrentCudaStream(stream_vec_[i], i); + } + } +#endif + + NVTX_RANGE(nvtx_, "ProcessRequests " + Name()); + + uint64_t exec_start_ns = 0; + SET_TIMESTAMP(exec_start_ns); + + const int max_batch_size = model_state_->MaxBatchSize(); + + // For each request collect the total batch size for this inference + // execution. The batch-size, number of inputs, and size of each + // input has already been checked so don't need to do that here. 
+ size_t total_batch_size = 0; + for (size_t i = 0; i < request_count; i++) { + // If we get a nullptr request then something is badly wrong. Fail + // and release all requests. + if (requests[i] == nullptr) { + RequestsRespondWithError( + requests, request_count, + TRITONSERVER_ErrorNew( + TRITONSERVER_ERROR_INTERNAL, + std::string( + "null request given to PyTorch backend for '" + Name() + "'") + .c_str())); + return; + } + } + + // At this point we are committed to running inference with all + // 'requests'. Create a response for each request. During input + // processing if there is an error with any request that error will + // be sent immediately with the corresponding response (and the + // response unique_ptr will then be nullptr). The request object + // itself will not be released until after all inferencing is done + // (below) as we may need to access the request object when + // determine how to process outputs (for example, even if we don't + // need the outputs for a request that has an error, we do need to + // know the size of those outputs associated with the request so we + // can skip them in the output tensors). + std::vector responses; + responses.reserve(request_count); + bool all_response_failed = false; + + for (size_t i = 0; i < request_count; i++) { + TRITONBACKEND_Response* response; + auto err = TRITONBACKEND_ResponseNew(&response, requests[i]); + if (err == nullptr) { + responses.emplace_back(response); + } else { + responses.emplace_back(nullptr); + LOG_MESSAGE(TRITONSERVER_LOG_ERROR, "Fail to create response"); + TRITONSERVER_ErrorDelete(err); + } + } + + for (size_t i = 0; i < request_count; i++) { + if (max_batch_size > 0) { + // Retrieve the batch size from one of the inputs, if the model + // supports batching, the first dimension size is batch size. + TRITONBACKEND_Input* input; + TRITONSERVER_Error* err = + TRITONBACKEND_RequestInputByIndex(requests[i], 0 /* index */, &input); + if (err == nullptr) { + const int64_t* shape; + err = TRITONBACKEND_InputProperties( + input, nullptr, nullptr, &shape, nullptr, nullptr, nullptr); + total_batch_size += shape[0]; + } + if (err != nullptr) { + RESPOND_ALL_AND_SET_TRUE_IF_ERROR( + responses, request_count, all_response_failed, err); + } + } else { + total_batch_size += 1; + } + } + + // If there are no valid payloads then no need to run the inference. + if (total_batch_size == 0) { + return; + } + + // Make sure the maximum batch size is not exceeded. The + // total_batch_size must be 1 for models that don't support batching + // (i.e. max_batch_size == 0). If max_batch_size is exceeded then + // scheduler has done something badly wrong so fail and release all + // requests. + if (!all_response_failed) { + if ((total_batch_size != 1) && + (total_batch_size > (size_t)max_batch_size)) { + RESPOND_ALL_AND_SET_TRUE_IF_ERROR( + responses, request_count, all_response_failed, + TRITONSERVER_ErrorNew( + TRITONSERVER_ERROR_INTERNAL, + std::string( + "batch size " + std::to_string(total_batch_size) + " for '" + + Name() + "', max allowed is " + + std::to_string(max_batch_size)) + .c_str())); + } + } + + std::vector input_names; + std::vector input_tensors; + bool cuda_copy = false; + std::unique_ptr collector; + + // For 'KIND_MODEL', it's fine to use CUDA events to calculate the compute + // input duration since only one stream will be used for input collection. 
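(Aside, not part of the patch: the compute-duration accounting used here relies on the standard CUDA event pattern — record an event on the stream before and after the work, then convert the pair into a duration with cudaEventElapsedTime. A minimal, self-contained sketch of that pattern follows; the function name and work placeholder are illustrative, not taken from this backend.)

// Illustrative sketch: measuring elapsed time on a CUDA stream with events.
#include <cuda_runtime_api.h>

float TimeSectionMs(cudaStream_t stream)
{
  cudaEvent_t start, stop;
  cudaEventCreate(&start);
  cudaEventCreate(&stop);

  cudaEventRecord(start, stream);  // mark the point where the work begins
  // ... enqueue GPU work on 'stream' here ...
  cudaEventRecord(stop, stream);   // mark the point where the work ends

  cudaEventSynchronize(stop);      // wait until 'stop' has actually been reached
  float ms = 0.0f;                 // cudaEventElapsedTime reports milliseconds
  cudaEventElapsedTime(&ms, start, stop);

  cudaEventDestroy(start);
  cudaEventDestroy(stop);
  return ms;
}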
+ if ((Kind() == TRITONSERVER_INSTANCEGROUPKIND_GPU) || + ((Kind() == TRITONSERVER_INSTANCEGROUPKIND_MODEL) && (device_cnt_ > 0))) { +#ifdef TRITON_ENABLE_GPU + RESPOND_ALL_AND_SET_TRUE_IF_ERROR( + responses, request_count, all_response_failed, + ConvertCUDAStatusToTritonError( + cudaEventRecord( + compute_input_start_event_, GetCudaStreamByInstanceKind()), + TRITONSERVER_ERROR_INTERNAL, "Failed to record the event.")); +#endif + } + + if (!all_response_failed) { + collector.reset(new BackendInputCollector( + requests, request_count, &responses, + model_state_->TritonMemoryManager(), model_state_->EnablePinnedInput(), + GetCudaStreamByInstanceKind(), nullptr, nullptr, 0, + HostPolicyName().c_str())); + RESPOND_ALL_AND_SET_TRUE_IF_ERROR( + responses, request_count, all_response_failed, + SetInputTensors( + total_batch_size, requests, request_count, &responses, + collector.get(), &input_names, &input_tensors, &cuda_copy)); + } + +#ifdef TRITON_ENABLE_GPU + if (cuda_copy) { + cudaStreamSynchronize(GetCudaStreamByInstanceKind()); + cuda_copy = false; + } +#endif + + std::vector output_tensors; + uint64_t compute_start_ns = 0; + uint64_t compute_infer_start = 0; + + RESPOND_ALL_AND_SET_TRUE_IF_ERROR( + responses, request_count, all_response_failed, + RecordBackendTimestamp( + &compute_start_ns, + reinterpret_cast(&compute_infer_start_event_))); + + // For 'KIND_MODEL', capture the timestamp for the compute infer duration. + if ((Kind() == TRITONSERVER_INSTANCEGROUPKIND_MODEL) && (device_cnt_ > 0)) { + SET_TIMESTAMP(compute_infer_start); + } + + // Run... + if (!all_response_failed) { + Execute(&responses, request_count, &input_tensors, &output_tensors); + } + + // Verify output indices are valid with number of outputs after execution + bool invalid_index = false; + int max_index = output_tensors.size() - 1; + + if (!all_response_failed) { + for (const auto& name : model_state_->ModelOutputs()) { + int op_index = output_index_map_[name.first]; + if ((op_index < 0) || (op_index > max_index)) { + RESPOND_ALL_AND_SET_TRUE_IF_ERROR( + responses, request_count, all_response_failed, + TRITONSERVER_ErrorNew( + TRITONSERVER_ERROR_INVALID_ARG, + std::string( + "The output " + std::string(name.first) + + " in the model configuration refers to an output index " + "which doesn't exist. This model has " + + std::to_string(max_index + 1) + " outputs") + .c_str())); + invalid_index = true; + break; + } + } + } + +#ifdef TRITON_ENABLE_GPU + if (Kind() == TRITONSERVER_INSTANCEGROUPKIND_MODEL) { + // For 'KIND_MODEL', multiple streams will be involved, so we need to call + // 'cudaStreamSynchronize' before reading the output tensors. 
+ for (auto& stream : stream_vec_) { + cudaStreamSynchronize(stream); + } + } +#endif + + uint64_t compute_end_ns = 0; + uint64_t compute_output_start = 0; + + if ((Kind() == TRITONSERVER_INSTANCEGROUPKIND_MODEL) && (device_cnt_ > 0)) { +#ifdef TRITON_ENABLE_GPU + SET_TIMESTAMP(compute_output_start); +#endif + } else { + RESPOND_ALL_AND_SET_TRUE_IF_ERROR( + responses, request_count, all_response_failed, + RecordBackendTimestamp( + &compute_end_ns, + reinterpret_cast(&compute_output_start_event_))); + } + + if (!all_response_failed) { + if (!invalid_index) { + RESPOND_ALL_AND_SET_TRUE_IF_ERROR( + responses, request_count, all_response_failed, + ReadOutputTensors( + total_batch_size, output_tensors, requests, request_count, + &responses)); + } + } + + uint64_t exec_end_ns = 0; + SET_TIMESTAMP(exec_end_ns); + + // Send all the responses that haven't already been sent because of + // an earlier error. Note that the responses are not set to nullptr + // here as we need that indication below to determine if the request + // we successful or not. + for (auto& response : responses) { + if (response != nullptr) { + LOG_IF_ERROR( + TRITONBACKEND_ResponseSend( + response, TRITONSERVER_RESPONSE_COMPLETE_FINAL, nullptr), + "failed to send PyTorch backend response"); + } + } + + // We don't need an explicit CUDA syncrhonization here since we have already + // synchronized the stream in the ReadOutputTensors function. + if (Kind() == TRITONSERVER_INSTANCEGROUPKIND_GPU) { +#ifdef TRITON_ENABLE_GPU + float compute_input_duration = GetCudaEventElapsedTime( + compute_input_start_event_, compute_infer_start_event_); + float compute_infer_duration = GetCudaEventElapsedTime( + compute_infer_start_event_, compute_output_start_event_); + + compute_start_ns = exec_start_ns + (compute_input_duration * 1e6); + compute_end_ns = compute_start_ns + (compute_infer_duration * 1e6); +#endif + } else if ( + (Kind() == TRITONSERVER_INSTANCEGROUPKIND_MODEL) && (device_cnt_ > 0)) { +#ifdef TRITON_ENABLE_GPU + float compute_input_duration = GetCudaEventElapsedTime( + compute_input_start_event_, compute_infer_start_event_); + uint64_t compute_infer_duration = + compute_output_start - compute_infer_start; + + compute_start_ns = exec_start_ns + (compute_input_duration * 1e6); + compute_end_ns = compute_start_ns + compute_infer_duration; +#endif + } + + // Report statistics for each request. + for (uint32_t r = 0; r < request_count; ++r) { + auto& request = requests[r]; + LOG_IF_ERROR( + TRITONBACKEND_ModelInstanceReportStatistics( + TritonModelInstance(), request, + (responses[r] != nullptr) /* success */, exec_start_ns, + compute_start_ns, compute_end_ns, exec_end_ns), + "failed reporting request statistics"); + + LOG_IF_ERROR( + TRITONBACKEND_RequestRelease(request, TRITONSERVER_REQUEST_RELEASE_ALL), + "failed releasing request"); + } + + if (!all_response_failed) { + // Report the entire batch statistics. 
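(Aside on the duration arithmetic a few lines above, not part of the patch: cudaEventElapsedTime returns milliseconds, so multiplying by 1e6 converts the value to the nanosecond scale Triton's statistics API expects. With made-up numbers, if exec_start_ns is 1,000,000,000 and the measured compute-input duration is 2.5 ms, then compute_start_ns = 1,000,000,000 + 2.5 * 1e6 = 1,002,500,000 ns; a 10 ms infer duration in the KIND_GPU branch then puts compute_end_ns at 1,012,500,000 ns. In the KIND_MODEL branch the infer duration is already a difference of CPU timestamps in nanoseconds, so it is added without conversion.)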
+    LOG_IF_ERROR(
+        TRITONBACKEND_ModelInstanceReportBatchStatistics(
+            TritonModelInstance(), total_batch_size, exec_start_ns,
+            compute_start_ns, compute_end_ns, exec_end_ns),
+        "failed reporting batch request statistics");
+  }
+}
+
+TRITONSERVER_Error*
+ModelInstanceState::ReadOutputTensors(
+    size_t total_batch_size,
+    const std::vector<torch::jit::IValue>& output_tensors,
+    TRITONBACKEND_Request** requests, const uint32_t request_count,
+    std::vector<TRITONBACKEND_Response*>* responses)
+{
+  NVTX_RANGE(nvtx_, "ReadOutputTensors " + Name());
+
+  BackendOutputResponder responder(
+      requests, request_count, responses, model_state_->TritonMemoryManager(),
+      model_state_->MaxBatchSize() > 0, model_state_->EnablePinnedInput(),
+      GetCudaStreamByInstanceKind());
+
+  bool cuda_copy = false;
+  // The serialized string buffer must be valid until output copies are done
+  std::vector<std::unique_ptr<std::string>> string_buffer;
+  for (auto& output : model_state_->ModelOutputs()) {
+    int op_index = output_index_map_[output.first];
+    auto name = output.first;
+    auto output_tensor_pair = output.second;
+
+    if (output_tensors[op_index].isTensor()) {
+      torch::Tensor output_flat;
+      try {
+        output_flat =
+            output_tensors[op_index].toTensor().contiguous().flatten();
+      }
+      catch (std::exception& ex) {
+        RETURN_IF_ERROR(TRITONSERVER_ErrorNew(
+            TRITONSERVER_ERROR_INTERNAL,
+            (std::string("output tensor '") + name + "' is not found")
+                .c_str()));
+      }
+
+      // Verify output datatype matches datatype from model config
+      TRITONSERVER_DataType output_dtype =
+          ConvertTorchTypeToDataType(output_flat.scalar_type());
+      TRITONSERVER_DataType config_datatype = output_dtype_map_[name];
+      if (config_datatype != output_dtype) {
+        RETURN_IF_ERROR(TRITONSERVER_ErrorNew(
+            TRITONSERVER_ERROR_INVALID_ARG,
+            (std::string("configuration expects datatype TYPE_") +
+             TRITONSERVER_DataTypeString(config_datatype) + " for output '" +
+             name + "', model provides TYPE_" +
+             TRITONSERVER_DataTypeString(output_dtype))
+                .c_str()));
+      }
+
+      const char* output_buffer =
+          static_cast<const char*>(output_flat.data_ptr());
+
+      // Output tensors may not reside on the same device as model
+      torch::Device tensor_device = output_flat.device();
+      const auto memory_type = (tensor_device.type() == torch::kCPU)
+                                   ? TRITONSERVER_MEMORY_CPU
+                                   : TRITONSERVER_MEMORY_GPU;
+      const auto memory_id =
+          (tensor_device.type() == torch::kCPU) ?
0 : tensor_device.index(); + + // Batch output doesn't support string data type yet, as it is not trivial + // to parse string output + const BatchOutput* batch_output = StateForModel()->FindBatchOutput(name); + if (batch_output == nullptr) { + // Get output shape + std::vector batchn_shape; + auto shape = output_tensors[op_index].toTensor().sizes(); + for (auto itr = shape.begin(); itr != shape.end(); itr++) { + batchn_shape.push_back(*itr); + } + + if (batchn_shape.size() == 0) { + return TRITONSERVER_ErrorNew( + TRITONSERVER_ERROR_INVALID_ARG, + (std::string("output '") + name + + "' is a scalar which is not supported.") + .c_str()); + } + if (output_tensor_pair.first != -1) { + responder.ProcessTensor( + name, output_dtype, batchn_shape, output_buffer, memory_type, + memory_id); + } + if (output_tensor_pair.second != -1) { + std::vector states; + states = responder.ProcessStateTensor( + name, output_dtype, batchn_shape, output_buffer, memory_type, + memory_id); + // Update the states + for (auto& state : states) { + RETURN_IF_ERROR(TRITONBACKEND_StateUpdate(state)); + } + } + + } else { + responder.ProcessBatchOutput( + name, *batch_output, output_buffer, memory_type, memory_id); + } + } else if (output_tensors[op_index].isList()) { + // Custom handling for string/bytes tensor... + torch::List output_list = + output_tensors[op_index].toList(); + + // Get output shape + std::vector batchn_shape{(int64_t)output_list.size()}; + + for (size_t idx = 0; idx < responses->size(); idx++) { + auto& request = requests[idx]; + auto& response = (*responses)[idx]; + + if (supports_batching_ != 0) { + TRITONBACKEND_Input* input; + TRITONBACKEND_RequestInputByIndex(request, 0 /* index*/, &input); + const int64_t* shape; + TRITONBACKEND_InputProperties( + input, nullptr, nullptr, &shape, nullptr, nullptr, nullptr); + batchn_shape[0] = shape[0]; + } + + int64_t tensor_element_cnt = 0; + RETURN_IF_ERROR(GetElementCount(batchn_shape, &tensor_element_cnt)); + + // Only need an response tensor for requested outputs. + if (response != nullptr) { + if (output_tensor_pair.first != -1) { + TRITONBACKEND_Output* response_output; + RESPOND_AND_SET_NULL_IF_ERROR( + &response, TRITONBACKEND_ResponseOutput( + response, &response_output, name.c_str(), + TRITONSERVER_TYPE_BYTES, batchn_shape.data(), + batchn_shape.size())); + string_buffer.emplace_back(new std::string()); + cuda_copy |= SetStringOutputBuffer( + &output_list, &response, response_output, tensor_element_cnt, + GetCudaStreamByInstanceKind(), string_buffer.back().get()); + } + } + if (output_tensor_pair.second != -1) { + TRITONBACKEND_State* response_state; + RESPOND_AND_SET_NULL_IF_ERROR( + &response, TRITONBACKEND_StateNew( + &response_state, request, name.c_str(), + TRITONSERVER_TYPE_BYTES, batchn_shape.data(), + batchn_shape.size())); + + string_buffer.emplace_back(new std::string()); + cuda_copy |= SetStringStateBuffer( + &output_list, &response, response_state, tensor_element_cnt, + GetCudaStreamByInstanceKind(), string_buffer.back().get()); + } + } + } else { + return TRITONSERVER_ErrorNew( + TRITONSERVER_ERROR_INVALID_ARG, + (std::string("output '") + name + + "' must be of type Tensor or List[str].") + .c_str()); + } + } + + // Finalize and wait for any pending buffer copies. + cuda_copy |= responder.Finalize(); + +#ifdef TRITON_ENABLE_GPU + // We have to always synchronize the stream. This is to make sure that + // the events on the cuda stream are synchronized. 
Otherwise, the events + // are only guaranteed to be synchronized if the model provides the output + // on GPU. + cudaStreamSynchronize(GetCudaStreamByInstanceKind()); +#endif + + return nullptr; +} + +TRITONSERVER_Error* +ModelInstanceState::RecordBackendTimestamp( + uint64_t* timestamp, void* cuda_event) +{ + if ((Kind() == TRITONSERVER_INSTANCEGROUPKIND_GPU) || + ((Kind() == TRITONSERVER_INSTANCEGROUPKIND_MODEL) && (device_cnt_ > 0))) { +#ifdef TRITON_ENABLE_GPU + cudaEvent_t* lcuda_event = reinterpret_cast(cuda_event); + RETURN_IF_ERROR(ConvertCUDAStatusToTritonError( + cudaEventRecord(*lcuda_event, GetCudaStreamByInstanceKind()), + TRITONSERVER_ERROR_INTERNAL, "Failed to record the event.")); +#endif + } else { + SET_TIMESTAMP(*timestamp); + } + return nullptr; +} + +void +ModelInstanceState::SetCurrentCudaStream( + const cudaStream_t& stream, const int& device_id) +{ +#ifdef TRITON_ENABLE_GPU + at::cuda::CUDAStream torch_stream = + at::cuda::getStreamFromExternal(stream, device_id); + // This function replaces the default stream with the stream we created. It + // is not necessary to change the current device to the desired device when + // replacing the default stream for that device. See the documentation here: + // https://pytorch.org/cppdocs/api/function_namespacec10_1_1cuda_1a6ed50cc0fc16cc7014d9c2f4c3bd098d.html + at::cuda::setCurrentCUDAStream(torch_stream); +#endif +} + +TRITONSERVER_Error* +ModelInstanceState::SetInputTensors( + size_t total_batch_size, TRITONBACKEND_Request** requests, + const uint32_t request_count, + std::vector* responses, + BackendInputCollector* collector, std::vector* input_names, + std::vector* input_tensors, bool* cuda_copy) +{ + // InferenceMode should be used to guard all tensors operations + torch::InferenceMode infer_guard(model_state_->EnabledInferenceMode()); + + // All requests must have equally-sized input tensors so use any + // request as the representative for the input tensors. + uint32_t input_count; + RETURN_IF_ERROR(TRITONBACKEND_RequestInputCount(requests[0], &input_count)); + + input_tensors->resize(input_count + batch_input_count_); + + // The inputs must be in contiguous CPU/GPU memory. + std::vector> alloc_perference; + if (device_.is_cpu()) { + alloc_perference = { + {TRITONSERVER_MEMORY_CPU_PINNED, 0}, {TRITONSERVER_MEMORY_CPU, 0}}; + } else { + alloc_perference = {{TRITONSERVER_MEMORY_GPU, device_.index()}}; + } + + for (uint32_t input_idx = 0; input_idx < input_count; input_idx++) { + TRITONBACKEND_Input* input; + RETURN_IF_ERROR( + TRITONBACKEND_RequestInputByIndex(requests[0], input_idx, &input)); + + const char* input_name; + TRITONSERVER_DataType input_datatype; + const int64_t* input_shape; + uint32_t input_dims_count; + RETURN_IF_ERROR(TRITONBACKEND_InputProperties( + input, &input_name, &input_datatype, &input_shape, &input_dims_count, + nullptr, nullptr)); + + input_names->emplace_back(input_name); + + // The shape for the entire input patch, + // [total_batch_size, ...] 
for non-ragged input and + // [total_element_count] for ragged input (non-nested tensor) + std::vector batchn_shape; + if (StateForModel()->IsInputRagged(input_name)) { + batchn_shape = std::vector{0}; + for (size_t idx = 0; idx < request_count; idx++) { + TRITONBACKEND_Input* input; + RESPOND_AND_SET_NULL_IF_ERROR( + &((*responses)[idx]), + TRITONBACKEND_RequestInput(requests[idx], input_name, &input)); + const int64_t* input_shape; + uint32_t input_dims_count; + RESPOND_AND_SET_NULL_IF_ERROR( + &((*responses)[idx]), TRITONBACKEND_InputProperties( + input, nullptr, nullptr, &input_shape, + &input_dims_count, nullptr, nullptr)); + + int64_t element_cnt = 0; + RESPOND_AND_SET_NULL_IF_ERROR( + &((*responses)[idx]), + GetElementCount(input_shape, input_dims_count, &element_cnt)); + batchn_shape[0] += element_cnt; + } + } else { + batchn_shape = + std::vector(input_shape, input_shape + input_dims_count); + if (supports_batching_) { + batchn_shape[0] = total_batch_size; + } + } + + // The input must be in contiguous CPU/GPU memory. + std::vector> alloc_perference; + // For 'KIND_MODEL', input will always be in CPU as we don't have a way to + // query the input types. + if (device_.is_cpu() || (Kind() == TRITONSERVER_INSTANCEGROUPKIND_MODEL)) { + alloc_perference = { + {TRITONSERVER_MEMORY_CPU_PINNED, 0}, {TRITONSERVER_MEMORY_CPU, 0}}; + } else { + alloc_perference = {{TRITONSERVER_MEMORY_GPU, device_.index()}}; + } + + const char* input_buffer; + size_t batchn_byte_size; + TRITONSERVER_MemoryType memory_type; + int64_t memory_type_id; + RETURN_IF_ERROR(collector->ProcessTensor( + input_name, nullptr, 0, alloc_perference, &input_buffer, + &batchn_byte_size, &memory_type, &memory_type_id)); + + // Create Torch tensor + const auto torch_dtype = ConvertDataTypeToTorchType(input_datatype); + torch::TensorOptions options{torch_dtype.second}; + auto updated_options = (memory_type == TRITONSERVER_MEMORY_GPU) + ? options.device(torch::kCUDA, device_.index()) + : options.device(torch::kCPU); + + if (input_datatype == TRITONSERVER_TYPE_BYTES) { + // Create the PyTorch list to hold the strings. 
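(Aside before the string path continues below, not part of the patch: a quick worked example for the shape handling above, with hypothetical request shapes. In the ragged branch, two requests with input shapes [3, 4] and [5, 4] contribute element counts of 12 and 20, so batchn_shape collapses to the single dimension [32]. In the non-ragged branch with max_batch_size > 0 and a per-request shape of [1, 4], the first dimension is overwritten with the accumulated batch, giving [total_batch_size, 4].)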
+ torch::List input_list; + input_list.reserve(batchn_shape[0]); + + for (size_t idx = 0; idx < request_count; idx++) { + TRITONBACKEND_Input* input; + RESPOND_AND_SET_NULL_IF_ERROR( + &((*responses)[idx]), + TRITONBACKEND_RequestInput(requests[idx], input_name, &input)); + const int64_t* shape; + uint32_t dims_count; + uint32_t buffer_count; + RESPOND_AND_SET_NULL_IF_ERROR( + &((*responses)[idx]), + TRITONBACKEND_InputPropertiesForHostPolicy( + input, HostPolicyName().c_str(), nullptr, nullptr, &shape, + &dims_count, nullptr, &buffer_count)); + + int64_t batch_element_cnt = 0; + RESPOND_AND_SET_NULL_IF_ERROR( + &((*responses)[idx]), + GetElementCount(shape, dims_count, &batch_element_cnt)); + + *cuda_copy |= SetStringInputTensor( + &input_list, input, input_name, buffer_count, batch_element_cnt, + &((*responses)[idx]), GetCudaStreamByInstanceKind(), + HostPolicyName().c_str()); + } + + (*input_tensors)[input_index_map_[input_name]] = input_list; + } else { + if (batchn_byte_size) { + // Remove constness to align with the signature of torch::from_blob() + torch::Tensor input_tensor = torch::from_blob( + const_cast(input_buffer), batchn_shape, updated_options); + (*input_tensors)[input_index_map_[input_name]] = input_tensor; + } else { + // torch:from_blob seems not working when the input size is 0 + // create zero-length inputs directly + torch::Tensor input_tensor = + torch::zeros(batchn_shape, updated_options); + (*input_tensors)[input_index_map_[input_name]] = input_tensor; + } + } + } + + for (const auto& batch_input : StateForModel()->BatchInputs()) { + std::vector shape; + collector->BatchInputShape(batch_input, &shape); + + for (const auto& input_name : batch_input.TargetNames()) { + input_names->emplace_back(input_name.c_str()); + + const char* dst_buffer; + size_t dst_buffer_byte_size; + TRITONSERVER_MemoryType dst_memory_type; + int64_t dst_memory_type_id; + + RESPOND_ALL_AND_SET_NULL_IF_ERROR( + (*responses), responses->size(), + collector->ProcessBatchInput( + batch_input, nullptr, 0, alloc_perference, &dst_buffer, + &dst_buffer_byte_size, &dst_memory_type, &dst_memory_type_id)); + + const auto torch_dtype = + ConvertDataTypeToTorchType(batch_input.DataType()); + torch::TensorOptions options{torch_dtype.second}; + auto updated_options = (dst_memory_type == TRITONSERVER_MEMORY_GPU) + ? options.device(torch::kCUDA, device_.index()) + : options.device(torch::kCPU); + + if (dst_buffer_byte_size) { + torch::Tensor input_tensor = torch::from_blob( + const_cast(dst_buffer), shape, updated_options); + (*input_tensors)[input_index_map_[input_name]] = input_tensor; + } else { + // special handle when input has zero size + torch::Tensor input_tensor = torch::zeros(shape, updated_options); + (*input_tensors)[input_index_map_[input_name]] = input_tensor; + } + } + } + + // Finalize... 
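(Aside, not part of the patch: the non-string path above wraps Triton's contiguous input buffers with torch::from_blob, so no copy is made and the underlying buffer must stay alive for as long as the tensor is used. A minimal standalone sketch of that idea, with hypothetical buffer contents and shape:)

// Illustrative sketch: zero-copy view over an existing buffer via torch::from_blob.
#include <torch/torch.h>
#include <vector>

int main()
{
  std::vector<float> buffer = {1.f, 2.f, 3.f, 4.f, 5.f, 6.f};  // owned elsewhere
  // Wrap the existing memory; no allocation or copy happens here.
  torch::Tensor view = torch::from_blob(
      buffer.data(), {2, 3}, torch::TensorOptions().dtype(torch::kFloat32));
  // 'view' aliases 'buffer', so it must not outlive the vector.
  torch::Tensor doubled = view * 2;  // arithmetic produces a new, owning tensor
  return 0;
}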
+ *cuda_copy |= collector->Finalize(); + + return nullptr; +} + +TRITONSERVER_Error* +ModelInstanceState::ValidateBooleanSequenceControl( + triton::common::TritonJson::Value& sequence_batching, + const std::string& control_kind, bool required, bool* have_control) +{ + std::string tensor_name; + std::string tensor_datatype; + RETURN_IF_ERROR(GetBooleanSequenceControlProperties( + sequence_batching, model_state_->Name(), control_kind, required, + &tensor_name, &tensor_datatype, nullptr, nullptr, nullptr, nullptr, + nullptr, nullptr)); + *have_control = !tensor_name.empty(); + if (*have_control) { + std::string deliminator = "__"; + int ip_index = 0; + int start_pos = tensor_name.find(deliminator); + if (start_pos == -1) { + return TRITONSERVER_ErrorNew( + TRITONSERVER_ERROR_INTERNAL, + ("input '" + tensor_name + + "' does not follow __ naming convention.") + .c_str()); + } + + // check if the index part of the name is not an integer + std::string index_str = tensor_name.substr(start_pos + 2); + for (auto itr = index_str.begin(); itr != index_str.end(); itr++) { + if (std::isdigit(*itr) == 0) { + return TRITONSERVER_ErrorNew( + TRITONSERVER_ERROR_INTERNAL, + ("input '" + tensor_name + + "' does not follow __ naming convention.") + .c_str()); + } + } + + ip_index = std::atoi(tensor_name.substr(start_pos + 2).c_str()); + input_index_map_[tensor_name] = ip_index; + } + + return nullptr; // success +} + +TRITONSERVER_Error* +ModelInstanceState::ValidateInputs(const size_t expected_input_cnt) +{ + // Collect all the expected input tensor names and validate that the model + // configuration specifies only those. + std::vector allowed_inputs; + + const torch::jit::Method& method = torch_model_->get_method("forward"); + const auto& schema = method.function().getSchema(); + const std::vector& arguments = schema.arguments(); + + // Currently, only models with a single input of type Dict(str, Tensor) are + // supported. If the model expects more than one input then they must be all + // be of type Tensor. + // + // Ignore the argument at idx 0 if it is of Class type (self param in forward + // function) + size_t start_idx = 0; + if ((arguments.size() > 0) && + (arguments.at(0).type()->kind() == c10::TypeKind::ClassType)) { + start_idx = 1; + } + if ((arguments.size() == (1 + start_idx)) && + (arguments.at(start_idx).type()->kind() == c10::TypeKind::DictType)) { + is_dict_input_ = true; + } else if (arguments.size() > start_idx) { + // Return error if multiple inputs are of kind DictType + for (size_t i = start_idx + 1; i < arguments.size(); i++) { + if (arguments.at(i).type()->kind() == c10::TypeKind::DictType) { + return TRITONSERVER_ErrorNew( + TRITONSERVER_ERROR_INTERNAL, + "Multiple inputs of kind DictType were detected. Only a single " + "input of type Dict(str, Tensor) is supported."); + } + } + + // Return error if all inputs are not of type Tensor + for (size_t i = start_idx; i < arguments.size(); i++) { + if ((arguments.at(i).type()->kind() != c10::TypeKind::TensorType) && + (arguments.at(i).type()->kind() != c10::TypeKind::ListType)) { + return TRITONSERVER_ErrorNew( + TRITONSERVER_ERROR_INTERNAL, + (std::string("An input of type '") + arguments.at(i).type()->str() + + "' was detected in the model. 
Only a single input of type " + "Dict(str, Tensor) or input(s) of type Tensor are supported.") + .c_str()); + } + allowed_inputs.emplace_back(arguments.at(i).name()); + } + + // If all inputs are tensors, match number of expected inputs between model + // and configuration + if ((arguments.size() - start_idx) != expected_input_cnt) { + return TRITONSERVER_ErrorNew( + TRITONSERVER_ERROR_INVALID_ARG, + (std::string("unable to load model '") + model_state_->Name() + + "', configuration expects " + std::to_string(expected_input_cnt) + + " inputs, model provides " + + std::to_string(arguments.size() - start_idx)) + .c_str()); + } + } + + triton::common::TritonJson::Value ios; + RETURN_IF_ERROR(model_state_->ModelConfig().MemberAsArray("input", &ios)); + + if (ios.ArraySize() == 0) { + return TRITONSERVER_ErrorNew( + TRITONSERVER_ERROR_INTERNAL, + "model configuration must contain at least one input, none were " + "specified."); + } + + NamingConvention naming_convention; + RETURN_IF_ERROR(GetNamingConvention(&naming_convention, allowed_inputs)); + + for (size_t i = 0; i < ios.ArraySize(); i++) { + triton::common::TritonJson::Value io; + RETURN_IF_ERROR(ios.IndexAsObject(i, &io)); + + // Validate name + std::string io_name; + RETURN_IF_ERROR(io.MemberAsString("name", &io_name)); + AddInputToMap(naming_convention, allowed_inputs, io_name, i); + // Validate data type + std::string io_dtype; + RETURN_IF_ERROR(io.MemberAsString("data_type", &io_dtype)); + const auto pr = ModelConfigDataTypeToTorchType(io_dtype); + if (!pr.first && (io_dtype != "TYPE_STRING")) { + return TRITONSERVER_ErrorNew( + TRITONSERVER_ERROR_INTERNAL, + ("unsupported datatype " + io_dtype + " for input '" + io_name + + "' for model '" + model_state_->Name() + "'") + .c_str()); + } + + // Validate shape for String inputs. Only allow 1 dimension. + if (io_dtype == "TYPE_STRING") { + // If a reshape is provided for the input then use that when + // validating the model shapes. + std::vector dims; + triton::common::TritonJson::Value reshape; + if (io.Find("reshape", &reshape)) { + RETURN_IF_ERROR(ParseShape(reshape, "shape", &dims)); + } else { + RETURN_IF_ERROR(ParseShape(io, "dims", &dims)); + } + + if ((dims.size() + (supports_batching_ ? 1 : 0)) > 1) { + return TRITONSERVER_ErrorNew( + TRITONSERVER_ERROR_INTERNAL, + ("Triton only supports 1 dimensional List of String as input for " + "'" + + std::string(io_name) + "' for model '" + model_state_->Name() + + "'") + .c_str()); + } + } + } + triton::common::TritonJson::Value sequence_batching; + if (model_state_->ModelConfig().Find( + "sequence_batching", &sequence_batching)) { + triton::common::TritonJson::Value states; + if (sequence_batching.Find("state", &states)) { + for (size_t i = 0; i < states.ArraySize(); i++) { + triton::common::TritonJson::Value state; + RETURN_IF_ERROR(states.IndexAsObject(i, &state)); + std::string state_name; + RETURN_IF_ERROR(state.MemberAsString("input_name", &state_name)); + AddInputToMap(naming_convention, allowed_inputs, state_name, i); + + // Validate data type + std::string state_dtype; + RETURN_IF_ERROR(state.MemberAsString("data_type", &state_dtype)); + const auto pr = ModelConfigDataTypeToTorchType(state_dtype); + if (!pr.first && (state_dtype != "TYPE_STRING")) { + return TRITONSERVER_ErrorNew( + TRITONSERVER_ERROR_INTERNAL, + ("unsupported datatype " + state_dtype + " for input state '" + + state_name + "' for model '" + model_state_->Name() + "'") + .c_str()); + } + + // Validate shape for String inputs. Only allow 1 dimension. 
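(Aside, not part of the patch: to make the dimension check above concrete, with hypothetical configurations. For a non-batching model, max_batch_size = 0 makes supports_batching_ false, so a TYPE_STRING input with dims: [-1] yields dims.size() + 0 = 1 and passes, while dims: [2, -1] yields 2 and is rejected. Once batching is enabled the implicit batch dimension adds 1 to the count, so as written the check only passes when the effective dims — after applying any reshape — are empty, which is why the reshape is consulted before validating.)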
+ if (state_dtype == "TYPE_STRING") { + std::vector dims; + if ((dims.size() + (supports_batching_ ? 1 : 0)) > 1) { + return TRITONSERVER_ErrorNew( + TRITONSERVER_ERROR_INTERNAL, + ("Triton only supports 1 dimensional List of String as input " + "for " + "'" + + std::string(state_name) + "' for model '" + + model_state_->Name() + "'") + .c_str()); + } + } + } + } + } + + triton::common::TritonJson::Value batch_inputs; + RETURN_IF_ERROR( + model_state_->ModelConfig().MemberAsArray("batch_input", &batch_inputs)); + size_t i = 0; + for (const auto& batch_input : StateForModel()->BatchInputs()) { + for (const auto& input_name : batch_input.TargetNames()) { + AddInputToMap( + naming_convention, allowed_inputs, input_name, i + ios.ArraySize()); + i++; + } + } + + return nullptr; // success +} + +TRITONSERVER_Error* +ModelInstanceState::ValidateOutputs() +{ + triton::common::TritonJson::Value ios; + RETURN_IF_ERROR(model_state_->ModelConfig().MemberAsArray("output", &ios)); + std::string deliminator = "__"; + int op_index = 0; + + if (ios.ArraySize() == 0) { + return TRITONSERVER_ErrorNew( + TRITONSERVER_ERROR_INTERNAL, + "model configuration must contain at least one output, none were " + "specified."); + } + + NamingConvention naming_convention; + RETURN_IF_ERROR(GetNamingConvention(&naming_convention, {})); + + for (size_t i = 0; i < ios.ArraySize(); i++) { + triton::common::TritonJson::Value io; + RETURN_IF_ERROR(ios.IndexAsObject(i, &io)); + + // Validate name + std::string io_name; + RETURN_IF_ERROR(io.MemberAsString("name", &io_name)); + switch (naming_convention) { + case NamingConvention::NAMED_INDEX: { + int start_pos = io_name.find(deliminator); + op_index = std::atoi(io_name.substr(start_pos + 2).c_str()); + break; + } + case NamingConvention::STRICT_CONFIG_ORDERING: { + op_index = i; + break; + } + default: + break; + } + + // Validate data type + std::string io_dtype; + RETURN_IF_ERROR(io.MemberAsString("data_type", &io_dtype)); + const auto pr = ModelConfigDataTypeToTorchType(io_dtype); + if (!pr.first && (io_dtype != "TYPE_STRING")) { + return TRITONSERVER_ErrorNew( + TRITONSERVER_ERROR_INTERNAL, + ("unsupported datatype " + io_dtype + " for output '" + io_name + + "' for model '" + model_state_->Name() + "'") + .c_str()); + } + + // Validate shape for String outputs. Only allow 1 dimension. + if (io_dtype == "TYPE_STRING") { + // If a reshape is provided for the output then use that when + // validating the model shapes. + std::vector dims; + triton::common::TritonJson::Value reshape; + if (io.Find("reshape", &reshape)) { + RETURN_IF_ERROR(ParseShape(reshape, "shape", &dims)); + } else { + RETURN_IF_ERROR(ParseShape(io, "dims", &dims)); + } + + if ((dims.size() + (supports_batching_ ? 
1 : 0)) > 1) { + return TRITONSERVER_ErrorNew( + TRITONSERVER_ERROR_INTERNAL, + ("Triton only supports 1 dimensional List of String as output for " + "'" + + std::string(io_name) + "' for model '" + model_state_->Name() + + "'") + .c_str()); + } + } + + output_index_map_[io_name] = op_index; + output_dtype_map_[io_name] = ConvertTorchTypeToDataType(pr.second); + } + + triton::common::TritonJson::Value sequence_batching; + if (model_state_->ModelConfig().Find( + "sequence_batching", &sequence_batching)) { + triton::common::TritonJson::Value states; + if (sequence_batching.Find("state", &states)) { + for (size_t i = 0; i < states.ArraySize(); i++) { + triton::common::TritonJson::Value state; + RETURN_IF_ERROR(states.IndexAsObject(i, &state)); + std::string state_name; + RETURN_IF_ERROR(state.MemberAsString("output_name", &state_name)); + std::string state_dtype; + RETURN_IF_ERROR(state.MemberAsString("data_type", &state_dtype)); + std::vector dims; + RETURN_IF_ERROR(ParseShape(state, "dims", &dims)); + + // For state, naming convention is enforced to be NAMED_INDEX + int start_pos = state_name.find(deliminator); + op_index = std::atoi(state_name.substr(start_pos + 2).c_str()); + + const auto pr = ModelConfigDataTypeToTorchType(state_dtype); + if (!pr.first && (state_dtype != "TYPE_STRING")) { + return TRITONSERVER_ErrorNew( + TRITONSERVER_ERROR_INTERNAL, + ("unsupported datatype " + state_dtype + " for state '" + + state_name + "' for model '" + model_state_->Name() + "'") + .c_str()); + } + + // Validate shape for String outputs. Only allow 1 dimension. + if (state_dtype == "TYPE_STRING") { + if ((dims.size() + (supports_batching_ ? 1 : 0)) > 1) { + return TRITONSERVER_ErrorNew( + TRITONSERVER_ERROR_INTERNAL, + ("Triton only supports 1 dimensional List of String as output " + "for " + "'" + + std::string(state_name) + "' for model '" + + model_state_->Name() + "'") + .c_str()); + } + } + + output_index_map_[state_name] = op_index; + output_dtype_map_[state_name] = ConvertTorchTypeToDataType(pr.second); + } + } + } + + return nullptr; // success +} + +TRITONSERVER_Error* +ModelInstanceState::ValidateTypedSequenceControl( + triton::common::TritonJson::Value& sequence_batching, + const std::string& control_kind, bool required, bool* have_control) +{ + std::string tensor_name; + std::string tensor_datatype; + RETURN_IF_ERROR(GetTypedSequenceControlProperties( + sequence_batching, model_state_->Name(), control_kind, required, + &tensor_name, &tensor_datatype)); + *have_control = !tensor_name.empty(); + if (*have_control) { + std::string deliminator = "__"; + int ip_index = 0; + int start_pos = tensor_name.find(deliminator); + if (start_pos == -1) { + return TRITONSERVER_ErrorNew( + TRITONSERVER_ERROR_INTERNAL, + ("input '" + tensor_name + + "' does not follow __ naming convention.") + .c_str()); + } + + // check if the index part of the name is not an integer + std::string index_str = tensor_name.substr(start_pos + 2); + for (auto itr = index_str.begin(); itr != index_str.end(); itr++) { + if (std::isdigit(*itr) == 0) { + return TRITONSERVER_ErrorNew( + TRITONSERVER_ERROR_INTERNAL, + ("input '" + tensor_name + + "' does not follow __ naming convention.") + .c_str()); + } + } + + // check if the data type is supported by PyTorch + if (!ModelConfigDataTypeToTorchType(tensor_datatype).first) { + return TRITONSERVER_ErrorNew( + TRITONSERVER_ERROR_INTERNAL, + ("input '" + tensor_name + "' type '" + tensor_datatype + + "' is not supported by PyTorch.") + .c_str()); + } + + ip_index = 
std::atoi(tensor_name.substr(start_pos + 2).c_str()); + input_index_map_[tensor_name] = ip_index; + } + + return nullptr; // success +} + + +} // namespace triton::backend::pytorch diff --git a/src/model_instance_state.hh b/src/model_instance_state.hh new file mode 100644 index 0000000..b495510 --- /dev/null +++ b/src/model_instance_state.hh @@ -0,0 +1,178 @@ +// Copyright 2019-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions +// are met: +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// * Neither the name of NVIDIA CORPORATION nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +#pragma once + +#include + +#include +#include +#include +#include +#include + +#include "libtorch_utils.h" +#include "model_state.hh" +#include "naming_convention.hh" +#include "triton/backend/backend_common.h" +#include "triton/backend/backend_input_collector.h" +#include "triton/backend/backend_memory.h" +#include "triton/backend/backend_model.h" +#include "triton/backend/backend_model_instance.h" +#include "triton/backend/backend_output_responder.h" +#include "triton/common/nvtx.h" +#include "triton/core/tritonbackend.h" + + +namespace triton::backend::pytorch { + +// +// ModelInstanceState +// +// State associated with a model instance. An object of this class is +// created and associated with each TRITONBACKEND_ModelInstance. +// +class ModelInstanceState : public BackendModelInstance { + private: + ModelState* model_state_; + + // The full path to the TorchScript model file. + std::string model_path_; + + std::shared_ptr torch_model_; + torch::Device device_; + + // Map from configuration name for an input to the index of + // that input in the model. + std::unordered_map input_index_map_; + uint32_t batch_input_count_ = 0; + + // Map from configuration name for an output to the index of + // that output in the model. + std::unordered_map output_index_map_; + std::unordered_map output_dtype_map_; + + // If the input to the tensor is a dictionary of tensors. + bool is_dict_input_; + + // If the model supports batching. 
+ bool supports_batching_; + + cudaEvent_t compute_input_start_event_; + cudaEvent_t compute_infer_start_event_; + cudaEvent_t compute_output_start_event_; + + // Store the cuda streams created for the 'KIND_MODEL' instance group. + std::vector stream_vec_; + + // The number of available devices. + int device_cnt_; + + public: + virtual ~ModelInstanceState(); + + // Clear CUDA cache + void ClearCache(); + + static TRITONSERVER_Error* Create( + ModelState* model_state, + TRITONBACKEND_ModelInstance* triton_model_instance, + ModelInstanceState** state); + + // Execute... + void ProcessRequests( + TRITONBACKEND_Request** requests, const uint32_t request_count); + + // Get the state of the model that corresponds to this instance. + ModelState* StateForModel() const; + + private: + ModelInstanceState( + ModelState* model_state, + TRITONBACKEND_ModelInstance* triton_model_instance); + + void AddInputToMap( + NamingConvention naming_convention, + const std::vector allowed_inputs, const std::string& io_name, + const uint32_t index); + + // Create CUDA events for statistics collection. + void CreateCudaEvents(const int32_t& device_id); + + void Execute( + std::vector* responses, + const uint32_t response_count, + std::vector* input_tensors, + std::vector* output_tensors); + + // Get the elapsed time between two CUDA events. + float GetCudaEventElapsedTime( + const cudaEvent_t& start_event, const cudaEvent_t& end_event); + + // Get the appropriate CUDA stream for input and output handling based on + // the instance group type. + cudaStream_t GetCudaStreamByInstanceKind(); + + // Get the naming convention for inputs/outputs from the model configuration + TRITONSERVER_Error* GetNamingConvention( + NamingConvention* naming_convention, + const std::vector& allowed_io); + + TRITONSERVER_Error* ReadOutputTensors( + size_t total_batch_size, + const std::vector& output_tensors, + TRITONBACKEND_Request** requests, const uint32_t request_count, + std::vector* responses); + + TRITONSERVER_Error* RecordBackendTimestamp( + uint64_t* timestamp, void* cuda_event); + + // Replace the default CUDA stream with the stream we created to ensure + // proper cuda stream synchronization. + void SetCurrentCudaStream( + const cudaStream_t& stream, const int32_t& device_id); + + TRITONSERVER_Error* SetInputTensors( + size_t total_batch_size, TRITONBACKEND_Request** requests, + const uint32_t request_count, + std::vector* responses, + BackendInputCollector* collector, std::vector* input_names, + std::vector* input_tensors, bool* cuda_copy); + + TRITONSERVER_Error* ValidateBooleanSequenceControl( + triton::common::TritonJson::Value& sequence_batching, + const std::string& control_kind, bool required, bool* have_control); + + TRITONSERVER_Error* ValidateInputs(const size_t expected_input_cnt); + + TRITONSERVER_Error* ValidateOutputs(); + + TRITONSERVER_Error* ValidateTypedSequenceControl( + triton::common::TritonJson::Value& sequence_batching, + const std::string& control_kind, bool required, bool* have_control); +}; + +} // namespace triton::backend::pytorch diff --git a/src/model_state.cc b/src/model_state.cc new file mode 100644 index 0000000..b007438 --- /dev/null +++ b/src/model_state.cc @@ -0,0 +1,495 @@ +// Copyright 2019-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions +// are met: +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// * Neither the name of NVIDIA CORPORATION nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +#include "model_state.hh" + +#include + + +namespace { +std::once_flag pytorch_interop_threads_flag; +std::once_flag pytorch_intraop_threads_flag; +} // namespace + +namespace triton::backend::pytorch { + +ModelState::ModelState(TRITONBACKEND_Model* triton_model) + : BackendModel(triton_model), enable_optimized_execution_(true), + enable_inference_mode_(true), enable_cudnn_(true), + enable_cache_cleaning_(false), enable_weight_sharing_(false), + enable_tensor_fuser_pair_({false, true}), + enable_jit_profiling_pair_({false, true}), + enable_jit_executor_pair_({false, true}) +{ +} + +TRITONSERVER_Error* +ModelState::AutoCompleteConfig() +{ + // Auto-complete configuration is not supported since PyTorch does not + // store/capture sufficient model metadata so just log error instead. + LOG_MESSAGE( + TRITONSERVER_LOG_WARN, + (std::string("skipping model configuration auto-complete for '") + + Name() + "': not supported for pytorch backend") + .c_str()); + + return nullptr; // success +} + +TRITONSERVER_Error* +ModelState::Create(TRITONBACKEND_Model* triton_model, ModelState** state) +{ + try { + *state = new ModelState(triton_model); + } + catch (const BackendModelException& ex) { + RETURN_ERROR_IF_TRUE( + ex.err_ == nullptr, TRITONSERVER_ERROR_INTERNAL, + std::string("unexpected nullptr in BackendModelException")); + RETURN_IF_ERROR(ex.err_); + } + + // Auto-complete the configuration if requested... 
+ bool auto_complete_config = false; + RETURN_IF_ERROR(TRITONBACKEND_ModelAutoCompleteConfig( + triton_model, &auto_complete_config)); + if (auto_complete_config) { + RETURN_IF_ERROR((*state)->AutoCompleteConfig()); + RETURN_IF_ERROR((*state)->SetModelConfig()); + } + + auto& model_outputs = (*state)->model_outputs_; + // Parse the output states in the model configuration + triton::common::TritonJson::Value sequence_batching; + if ((*state)->ModelConfig().Find("sequence_batching", &sequence_batching)) { + triton::common::TritonJson::Value states; + if (sequence_batching.Find("state", &states)) { + for (size_t i = 0; i < states.ArraySize(); i++) { + triton::common::TritonJson::Value state; + RETURN_IF_ERROR(states.IndexAsObject(i, &state)); + std::string output_state_name; + RETURN_IF_ERROR( + state.MemberAsString("output_name", &output_state_name)); + auto it = model_outputs.find(output_state_name); + if (it == model_outputs.end()) { + model_outputs.insert({output_state_name, std::make_pair(-1, i)}); + } else { + it->second.second = i; + } + } + } + } + + // Parse the output names in the model configuration + triton::common::TritonJson::Value outputs; + RETURN_IF_ERROR((*state)->ModelConfig().MemberAsArray("output", &outputs)); + for (size_t i = 0; i < outputs.ArraySize(); i++) { + triton::common::TritonJson::Value output; + THROW_IF_BACKEND_INSTANCE_ERROR(outputs.IndexAsObject(i, &output)); + + // Use names from ModelConfig by reference since the model + // config will persist longer than this inference execution. + std::string output_name; + THROW_IF_BACKEND_INSTANCE_ERROR( + output.MemberAsString("name", &output_name)); + + auto it = model_outputs.find(output_name); + if (it == model_outputs.end()) { + model_outputs.insert({output_name, std::make_pair(i, -1)}); + } else { + it->second.first = i; + } + } + + RETURN_IF_ERROR((*state)->ParseParameters()); + + return nullptr; // success +} + +bool +ModelState::EnabledCacheCleaning() +{ + return enable_cache_cleaning_; +} + +bool +ModelState::EnabledCudnn() +{ + return enable_cudnn_; +} + +bool +ModelState::EnabledInferenceMode() +{ + return enable_inference_mode_; +} + +const std::pair& +ModelState::EnabledJitExecutor() const +{ + return enable_jit_executor_pair_; +} + +const std::pair& +ModelState::EnabledJitProfiling() const +{ + return enable_jit_profiling_pair_; +} + +bool +ModelState::EnabledOptimizedExecution() +{ + return enable_optimized_execution_; +} + +const std::pair& +ModelState::EnabledTensorExprFuser() const +{ + return enable_tensor_fuser_pair_; +} + +bool +ModelState::EnabledWeightSharing() +{ + return enable_weight_sharing_; +} + +TRITONSERVER_Error* +ModelState::LoadModel( + const std::string& artifact_name, const torch::Device device, + std::string* model_path, const TRITONSERVER_InstanceGroupKind& kind, + std::shared_ptr* torch_model) +{ + // Find the TorchScript file that describes the model. If the model + // configuration doesn't have an explicit model file specified then + // use the default name ("model.pt"). 
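(Aside on the model_outputs_ bookkeeping assembled in Create() above, not part of the patch: each entry maps an output name to a pair of indices, the first into the configuration's output array and the second into the sequence-batching state array, with -1 meaning "not present". With a hypothetical configuration, an output that only appears under output gets (i, -1), one that only appears as a state's output_name gets (-1, j), and a tensor serving both roles ends up with both indices set; ReadOutputTensors later checks each half of the pair against -1 to decide whether to produce a response output, a state update, or both.)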
+ std::string cc_model_filename = artifact_name; + if (cc_model_filename.empty()) { + cc_model_filename = "model.pt"; + } + + *model_path = JoinPath( + {RepositoryPath(), std::to_string(Version()), cc_model_filename}); + + { + bool exists; + RETURN_IF_ERROR(FileExists(*model_path, &exists)); + RETURN_ERROR_IF_FALSE( + exists, TRITONSERVER_ERROR_UNAVAILABLE, + std::string("unable to find '") + *model_path + + "' for model instance '" + Name() + "'"); + } + + // If weight sharing is enabled, skip loading model if + // it is already available on the target device + std::pair device_pair; + if (enable_weight_sharing_) { + device_pair = std::make_pair(!device.is_cpu(), device.index()); + auto mit = torch_models_.find(device_pair); + if (mit != torch_models_.end()) { + *torch_model = mit->second; + LOG_MESSAGE( + TRITONSERVER_LOG_INFO, + (std::string("Reusing TorchScript model for instance '") + Name() + + "'") + .c_str()); + return nullptr; // success + } + } + + // Serialize the torch model to string + std::string model_data_str; + RETURN_IF_ERROR(ReadTextFile(*model_path, &model_data_str)); + + // InferenceMode should be used to guard all tensors operations including + // model loading: https://pytorch.org/cppdocs/notes/inference_mode.html + torch::InferenceMode infer_guard(EnabledInferenceMode()); + + try { + std::istringstream model_stream(model_data_str); + if (kind == TRITONSERVER_INSTANCEGROUPKIND_MODEL) { + // Load the model without selecting a device. + torch_model->reset( + new torch::jit::Module(torch::jit::load(model_stream))); + } else { + torch_model->reset( + new torch::jit::Module(torch::jit::load(model_stream, device))); + } + } + catch (const std::exception& ex) { + return TRITONSERVER_ErrorNew( + TRITONSERVER_ERROR_INTERNAL, + ("failed to load model '" + Name() + "': " + ex.what()).c_str()); + } + + if (enable_weight_sharing_) { + if (!((torch_models_.emplace(device_pair, *torch_model)).second)) { + std::string type = device.is_cpu() ? "CPU" : "GPU"; + LOG_MESSAGE( + TRITONSERVER_LOG_WARN, + (std::string("Model already found on target ") + type + " device " + + "(id " + std::to_string(device.index()) + ") for '" + Name() + "'") + .c_str()); + } + } + + return nullptr; // success +} + +const std::map>& +ModelState::ModelOutputs() +{ + return model_outputs_; +} + +TRITONSERVER_Error* +ModelState::ParseParameters() +{ + triton::common::TritonJson::Value params; + bool status = model_config_.Find("parameters", ¶ms); + if (status) { + // If 'DISABLE_OPTIMIZED_EXECUTION' is not present in 'parameters' then no + // update is made to 'enable_optimized_execution_'. + bool disable_optimized_execution = false; + TRITONSERVER_Error* err = ParseParameter( + params, "DISABLE_OPTIMIZED_EXECUTION", &disable_optimized_execution); + if (err != nullptr) { + if (TRITONSERVER_ErrorCode(err) != TRITONSERVER_ERROR_NOT_FOUND) { + return err; + } else { + TRITONSERVER_ErrorDelete(err); + } + } + enable_optimized_execution_ = !disable_optimized_execution; + + LOG_MESSAGE( + TRITONSERVER_LOG_INFO, + (std::string("Optimized execution is ") + + (enable_optimized_execution_ ? "enabled" : "disabled") + + " for model instance '" + Name() + "'") + .c_str()); + + // If 'ENABLE_CACHE_CLEANING' is not present in 'parameters' then + // no update is made to 'enable_cache_cleaning_'. 
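+    // ParseParameter reports a TRITONSERVER_ERROR_NOT_FOUND error when the
+    // key is absent; that error is deleted below so the default is kept.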
+ err = ParseParameter( + params, "ENABLE_CACHE_CLEANING", &enable_cache_cleaning_); + if (err != nullptr) { + if (TRITONSERVER_ErrorCode(err) != TRITONSERVER_ERROR_NOT_FOUND) { + return err; + } else { + TRITONSERVER_ErrorDelete(err); + } + } + + LOG_MESSAGE( + TRITONSERVER_LOG_INFO, + (std::string("Cache Cleaning is ") + + (enable_cache_cleaning_ ? "enabled" : "disabled") + + " for model instance '" + Name() + "'") + .c_str()); + + // If 'INFERENCE_MODE' is not present in 'parameters' then no update is made + // to 'enable_inference_mode_'. + err = ParseParameter(params, "INFERENCE_MODE", &enable_inference_mode_); + if (err != nullptr) { + if (TRITONSERVER_ErrorCode(err) != TRITONSERVER_ERROR_NOT_FOUND) { + return err; + } else { + TRITONSERVER_ErrorDelete(err); + } + } + LOG_MESSAGE( + TRITONSERVER_LOG_INFO, + (std::string("Inference Mode is ") + + (enable_inference_mode_ ? "enabled" : "disabled") + + " for model instance '" + Name() + "'") + .c_str()); + + // If 'DISABLE_CUDNN' is not present in 'parameters' then no update is made + // to 'enable_cudnn_'. + bool disable_cudnn = false; + err = ParseParameter(params, "DISABLE_CUDNN", &disable_cudnn); + if (err != nullptr) { + if (TRITONSERVER_ErrorCode(err) != TRITONSERVER_ERROR_NOT_FOUND) { + return err; + } else { + TRITONSERVER_ErrorDelete(err); + } + } + enable_cudnn_ = !disable_cudnn; + LOG_MESSAGE( + TRITONSERVER_LOG_INFO, + (std::string("cuDNN is ") + (enable_cudnn_ ? "enabled" : "disabled") + + " for model instance '" + Name() + "'") + .c_str()); + + // If 'ENABLE_TENSOR_FUSER' is not present in 'parameters' then no + // update is made to 'enable_tensor_fuser'. + bool enable_tensor_fuser = false; + err = ParseParameter(params, "ENABLE_TENSOR_FUSER", &enable_tensor_fuser); + if (err != nullptr) { + if (TRITONSERVER_ErrorCode(err) != TRITONSERVER_ERROR_NOT_FOUND) { + return err; + } else { + TRITONSERVER_ErrorDelete(err); + } + } else { + enable_tensor_fuser_pair_ = {true, enable_tensor_fuser}; + LOG_MESSAGE( + TRITONSERVER_LOG_INFO, + (std::string("Tensor fuser is ") + + (enable_tensor_fuser ? "enabled" : "disabled") + + " for model instance '" + Name() + "'") + .c_str()); + } + + // If 'ENABLE_WEIGHT_SHARING' is not present in 'parameters' then no + // update is made to 'enable_weight_sharing'. + err = ParseParameter( + params, "ENABLE_WEIGHT_SHARING", &enable_weight_sharing_); + if (err != nullptr) { + if (TRITONSERVER_ErrorCode(err) != TRITONSERVER_ERROR_NOT_FOUND) { + return err; + } else { + TRITONSERVER_ErrorDelete(err); + } + } else { + LOG_MESSAGE( + TRITONSERVER_LOG_INFO, + (std::string("Weight sharing is ") + + (enable_weight_sharing_ ? "enabled" : "disabled") + + " for model instance '" + Name() + "'") + .c_str()); + } + + // If 'ENABLE_JIT_PROFILING' is not present in 'parameters' then no update + // is made to 'enable_jit_profiling'. + bool enable_jit_profiling = false; + err = ParseParameter(params, "ENABLE_JIT_PROFILING", &enable_jit_profiling); + if (err != nullptr) { + if (TRITONSERVER_ErrorCode(err) != TRITONSERVER_ERROR_NOT_FOUND) { + return err; + } else { + TRITONSERVER_ErrorDelete(err); + } + } else { + enable_jit_profiling_pair_ = {true, enable_jit_profiling}; + LOG_MESSAGE( + TRITONSERVER_LOG_INFO, + (std::string("Jit profiling is ") + + (enable_jit_profiling ? "enabled" : "disabled") + + " for model instance '" + Name() + "'") + .c_str()); + } + + // If 'ENABLE_JIT_EXECUTOR' is not present in 'parameters' then no update is + // made to 'enable_jit_executor'. 
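+    // The (provided, value) pair convention lets the instance code leave the
+    // corresponding PyTorch global setting untouched when the parameter was
+    // never specified.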
+ bool enable_jit_executor = false; + err = ParseParameter(params, "ENABLE_JIT_EXECUTOR", &enable_jit_executor); + if (err != nullptr) { + if (TRITONSERVER_ErrorCode(err) != TRITONSERVER_ERROR_NOT_FOUND) { + return err; + } else { + TRITONSERVER_ErrorDelete(err); + } + } else { + enable_jit_executor_pair_ = {true, enable_jit_executor}; + LOG_MESSAGE( + TRITONSERVER_LOG_INFO, + (std::string("Jit executor is ") + + (enable_jit_executor ? "enabled" : "disabled") + + " for model instance '" + Name() + "'") + .c_str()); + } + + // If 'INTRA_OP_THREAD_COUNT' is not present in 'parameters' then no update + // is made to 'intra_op_thread_count', which by default will take all + // threads + int intra_op_thread_count = -1; + err = + ParseParameter(params, "INTRA_OP_THREAD_COUNT", &intra_op_thread_count); + if (err != nullptr) { + if (TRITONSERVER_ErrorCode(err) != TRITONSERVER_ERROR_NOT_FOUND) { + return err; + } else { + TRITONSERVER_ErrorDelete(err); + } + } else { + if (intra_op_thread_count > 0) { + // at::set_num_threads() does not throw if called more than once, but + // issues warnings. std::call_once() is useful to limit these. + std::call_once(pytorch_intraop_threads_flag, [intra_op_thread_count]() { + at::set_num_threads(intra_op_thread_count); + }); + LOG_MESSAGE( + TRITONSERVER_LOG_INFO, + (std::string("Intra op thread count is set to ") + + std::to_string(at::get_num_threads()) + " for model instance '" + + Name() + "'") + .c_str()); + } + } + + // If 'INTER_OP_THREAD_COUNT' is not present in 'parameters' then no update + // is made to 'inter_op_thread_count', which by default will take all + // threads + int inter_op_thread_count = -1; + err = + ParseParameter(params, "INTER_OP_THREAD_COUNT", &inter_op_thread_count); + if (err != nullptr) { + if (TRITONSERVER_ErrorCode(err) != TRITONSERVER_ERROR_NOT_FOUND) { + return err; + } else { + TRITONSERVER_ErrorDelete(err); + } + } else { + if (inter_op_thread_count > 0) { + // at::set_num_interop_threads() throws if called more than once. + // std::call_once() should prevent this, but try/catch is additionally + // used for safety. + std::call_once(pytorch_interop_threads_flag, [inter_op_thread_count]() { + try { + at::set_num_interop_threads(inter_op_thread_count); + } + catch (const c10::Error& e) { + // do nothing + } + }); + LOG_MESSAGE( + TRITONSERVER_LOG_INFO, + (std::string("Inter op thread count is set to ") + + std::to_string(at::get_num_interop_threads()) + + " for model instance '" + Name() + "'") + .c_str()); + } + } + } + + return nullptr; +} + +} // namespace triton::backend::pytorch diff --git a/src/model_state.hh b/src/model_state.hh new file mode 100644 index 0000000..1a404b8 --- /dev/null +++ b/src/model_state.hh @@ -0,0 +1,131 @@ +// Copyright 2019-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions +// are met: +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// * Neither the name of NVIDIA CORPORATION nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. 
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+#pragma once
+
+#include <stdint.h>
+
+#include <map>
+#include <memory>
+#include <string>
+
+#include "libtorch_utils.h"
+#include "naming_convention.hh"
+#include "triton/backend/backend_common.h"
+#include "triton/backend/backend_input_collector.h"
+#include "triton/backend/backend_memory.h"
+#include "triton/backend/backend_model.h"
+#include "triton/backend/backend_model_instance.h"
+#include "triton/backend/backend_output_responder.h"
+#include "triton/common/nvtx.h"
+#include "triton/core/tritonbackend.h"
+
+// for thread control
+// https://pytorch.org/docs/stable/notes/cpu_threading_torchscript_inference.html#runtime-api
+// https://github.com/pytorch/pytorch/blob/v2.2.1-rc3/aten/src/ATen/Parallel.h#L133
+#include <ATen/Parallel.h>
+
+
+namespace triton::backend::pytorch {
+
+class ModelState : public triton::backend::BackendModel {
+ private:
+  // Flag to indicate whether optimized execution is enabled. Defaults to true.
+  bool enable_optimized_execution_;
+
+  // Flag to indicate whether inference mode is enabled. Defaults to true.
+  bool enable_inference_mode_;
+
+  // Flag to indicate whether cudnn is enabled. Defaults to true.
+  bool enable_cudnn_;
+
+  // Flag to indicate whether cache cleaning after each run is enabled.
+  // Defaults to false.
+  bool enable_cache_cleaning_;
+
+  // Flag to indicate whether weight sharing is enabled. Defaults to false.
+  bool enable_weight_sharing_;
+
+  // Flag pairs to indicate if various JIT settings are set and
+  // enabled respectively. Defaults to (false, true). Default behavior
+  // is to do nothing if not explicitly set.
+  std::pair<bool, bool> enable_tensor_fuser_pair_;
+  std::pair<bool, bool> enable_jit_profiling_pair_;
+  std::pair<bool, bool> enable_jit_executor_pair_;
+
+  // Model mapping for shared TorchScript model across all instances on the
+  // same device. The key is a pair of isGPU and device index.
+  std::map<
+      std::pair<bool, int64_t>, std::shared_ptr<torch::jit::Module>>
+      torch_models_;
+
+  // model_outputs is a map that contains unique outputs that the model must
+  // provide. The first pair is the model output index and the second is
+  // the index in the model state, -1 is used if one is not required.
+  // In the model configuration, the output in the state configuration
+  // can have intersection with the outputs section of the model. If an output
+  // is specified both in the output section and state section, it indicates
+  // that the backend must return the output state to the client too.
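+  // For example, {-1, 2} marks a tensor that is only the third configured
+  // state, while {0, 2} marks one that is both the first configured output
+  // and the third configured state.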
+  std::map<std::string, std::pair<int64_t, int64_t>> model_outputs_;
+
+ public:
+  virtual ~ModelState() = default;
+
+  static TRITONSERVER_Error* Create(
+      TRITONBACKEND_Model* triton_model, ModelState** state);
+
+  bool EnabledCacheCleaning();
+
+  bool EnabledCudnn();
+
+  bool EnabledInferenceMode();
+
+  const std::pair<bool, bool>& EnabledJitExecutor() const;
+
+  const std::pair<bool, bool>& EnabledJitProfiling() const;
+
+  bool EnabledOptimizedExecution();
+
+  const std::pair<bool, bool>& EnabledTensorExprFuser() const;
+
+  bool EnabledWeightSharing();
+
+  TRITONSERVER_Error* LoadModel(
+      const std::string& artifact_name, const torch::Device device,
+      std::string* model_path, const TRITONSERVER_InstanceGroupKind& kind,
+      std::shared_ptr<torch::jit::Module>* torch_model);
+
+  const std::map<std::string, std::pair<int64_t, int64_t>>& ModelOutputs();
+
+ private:
+  ModelState(TRITONBACKEND_Model* triton_model);
+
+  TRITONSERVER_Error* AutoCompleteConfig();
+
+  TRITONSERVER_Error* ParseParameters();
+};
+
+} // namespace triton::backend::pytorch
diff --git a/src/naming_convention.hh b/src/naming_convention.hh
new file mode 100644
index 0000000..756cba4
--- /dev/null
+++ b/src/naming_convention.hh
@@ -0,0 +1,40 @@
+// Copyright 2019-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions
+// are met:
+//  * Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+//  * Redistributions in binary form must reproduce the above copyright
+//    notice, this list of conditions and the following disclaimer in the
+//    documentation and/or other materials provided with the distribution.
+//  * Neither the name of NVIDIA CORPORATION nor the names of its
+//    contributors may be used to endorse or promote products derived
+//    from this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+#pragma once
+
+
+namespace triton::backend::pytorch {
+
+// The naming convention followed for inputs/outputs in the model
+// configuration. Outputs don't support FORWARD_ARGUMENT.
+enum class NamingConvention {
+  NAMED_INDEX,
+  FORWARD_ARGUMENT,
+  STRICT_CONFIG_ORDERING
+};
+
+} // namespace triton::backend::pytorch
diff --git a/src/string_utils.cc b/src/string_utils.cc
new file mode 100644
index 0000000..a605c7c
--- /dev/null
+++ b/src/string_utils.cc
@@ -0,0 +1,254 @@
+// Copyright 2019-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions
+// are met:
+//  * Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+// * Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// * Neither the name of NVIDIA CORPORATION nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +#include "string_utils.hh" + + +namespace triton::backend::pytorch { + +// This function will return a tensor's contents as a contiguous +// chunk in system memory. In some cases this will require copying the data. +// If that happens, 'contiguous_buffer' will be set to hold the contiguous +// chunk and 'cuda_copy' will be set to indicate whether CUDA copy is +// conducted. The data copy can be avoided if the input is already in +// a contiguous chunk and the input is located in memory type and id +// specified. +TRITONSERVER_Error* +GetContiguousInputContent( + TRITONBACKEND_Input* rinput, const uint32_t buffer_count, + const char** content, size_t* content_byte_size, + std::vector* contiguous_buffer, cudaStream_t stream, bool* cuda_copy) +{ + *cuda_copy = false; + + // Check input buffers to see if data copy is necessary + size_t chunk_count = 0; + bool type_mismatch = false; + uint64_t total_byte_size = 0; + for (size_t idx = 0; idx < buffer_count; ++idx) { + TRITONSERVER_MemoryType src_memory_type; + int64_t src_memory_type_id; + size_t src_byte_size; + const void* src_ptr; + + RETURN_IF_ERROR(TRITONBACKEND_InputBuffer( + rinput, idx, &src_ptr, &src_byte_size, &src_memory_type, + &src_memory_type_id)); + + if (src_ptr != nullptr) { + chunk_count++; + total_byte_size += src_byte_size; + type_mismatch |= (src_memory_type == TRITONSERVER_MEMORY_GPU); + } + } + + if (chunk_count == 0) { + *content = nullptr; + *content_byte_size = 0; + } else if ((chunk_count == 1) && !type_mismatch) { + TRITONSERVER_MemoryType src_memory_type; + int64_t src_memory_type_id; + RETURN_IF_ERROR(TRITONBACKEND_InputBuffer( + rinput, 0, (const void**)content, content_byte_size, &src_memory_type, + &src_memory_type_id)); + } else { + contiguous_buffer->resize(total_byte_size); + + size_t offset = 0; + for (size_t i = 0; i < chunk_count; i++) { + bool cuda_used; + TRITONSERVER_MemoryType src_memory_type; + int64_t src_memory_type_id; + size_t src_byte_size; + const void* src_ptr; + + RETURN_IF_ERROR(TRITONBACKEND_InputBuffer( + rinput, i, &src_ptr, &src_byte_size, &src_memory_type, + &src_memory_type_id)); + RETURN_IF_ERROR(CopyBuffer( + "Contiguous input", src_memory_type, src_memory_type_id, + TRITONSERVER_MEMORY_CPU, 0, src_byte_size, src_ptr, + contiguous_buffer->data() + offset, stream, &cuda_used)); + *cuda_copy |= 
cuda_used;
+      offset += src_byte_size;
+    }
+
+    *content = contiguous_buffer->data();
+    *content_byte_size = total_byte_size;
+  }
+
+  return nullptr; // success
+}
+
+void
+FillStringTensor(torch::List<std::string>* input_list, const size_t cnt)
+{
+  for (size_t c = 0; c < cnt; ++c) {
+    input_list->push_back("");
+  }
+}
+
+bool
+SetStringBuffer(
+    torch::List<torch::jit::IValue>* tensor, TRITONBACKEND_Response** response,
+    TRITONBACKEND_Output* response_output, TRITONBACKEND_State* response_state,
+    const size_t tensor_element_count, cudaStream_t stream,
+    std::string* serialized, bool state)
+{
+  bool cuda_copy = false;
+
+  // Serialize the output tensor strings. Each string is serialized as
+  // a 4-byte length followed by the string itself with no
+  // null-terminator.
+  serialized->clear();
+  for (size_t e = 0; e < tensor_element_count; ++e) {
+    std::string str = tensor->get(e).to<std::string>();
+    const char* cstr = str.c_str();
+    size_t len = str.length();
+    serialized->append(reinterpret_cast<const char*>(&len), sizeof(uint32_t));
+    if (len > 0) {
+      serialized->append(cstr, len);
+    }
+  }
+
+  // Allocate a buffer large enough to hold the serialized tensor.
+  TRITONSERVER_MemoryType actual_memory_type = TRITONSERVER_MEMORY_CPU;
+  int64_t actual_memory_type_id = 0;
+
+  TRITONSERVER_Error* err;
+  void* buffer;
+
+  if (!state) {
+    auto err = TRITONBACKEND_OutputBuffer(
+        response_output, &buffer, serialized->size(), &actual_memory_type,
+        &actual_memory_type_id);
+    if (err != nullptr) {
+      RESPOND_AND_SET_NULL_IF_ERROR(response, err);
+      return cuda_copy;
+    }
+  } else {
+    auto err = TRITONBACKEND_StateBuffer(
+        response_state, &buffer, serialized->size(), &actual_memory_type,
+        &actual_memory_type_id);
+    if (err != nullptr) {
+      RESPOND_AND_SET_NULL_IF_ERROR(response, err);
+      return cuda_copy;
+    }
+  }
+  // Copy the serialized tensor into the allocated buffer.
+  bool cuda_used = false;
+  err = CopyBuffer(
+      "String output", TRITONSERVER_MEMORY_CPU /* src_memory_type */,
+      0 /* src_memory_type_id */, actual_memory_type, actual_memory_type_id,
+      serialized->size(), reinterpret_cast<const void*>(serialized->c_str()),
+      buffer, stream, &cuda_used);
+  cuda_copy |= cuda_used;
+
+  if (err != nullptr) {
+    RESPOND_AND_SET_NULL_IF_ERROR(response, err);
+    return cuda_copy;
+  }
+
+  if (state) {
+    RESPOND_AND_SET_NULL_IF_ERROR(
+        response, TRITONBACKEND_StateUpdate(response_state));
+  }
+
+  return cuda_copy;
+}
+
+bool
+SetStringInputTensor(
+    torch::List<std::string>* input_list, TRITONBACKEND_Input* input,
+    const char* name, const uint32_t buffer_count,
+    const size_t request_element_cnt, TRITONBACKEND_Response** response,
+    cudaStream_t stream, const char* host_policy_name)
+{
+  bool cuda_copy = false;
+
+  // For the string data type, the data always needs to be on the CPU so
+  // that the string lengths can be read and the strings constructed
+  // properly. So if the request's input tensor is not on the CPU, copy it
+  // there.
+  const char* content = nullptr;
+  size_t content_byte_size = 0;
+
+  std::vector<char> contiguous_buffer;
+  auto err = GetContiguousInputContent(
+      input, buffer_count, &content, &content_byte_size, &contiguous_buffer,
+      stream, &cuda_copy);
+  if (err != nullptr) {
+    RESPOND_AND_SET_NULL_IF_ERROR(response, err);
+    FillStringTensor(input_list, request_element_cnt);
+    return cuda_copy;
+  }
+
+#ifdef TRITON_ENABLE_GPU
+  if (cuda_copy) {
+    cudaStreamSynchronize(stream);
+    cuda_copy = false;
+  }
+#endif // TRITON_ENABLE_GPU
+
+  std::vector<std::pair<const char*, const uint32_t>> str_list;
+  err = ValidateStringBuffer(
+      content, content_byte_size, request_element_cnt, name, &str_list);
+  // Set string values.
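+  // Each (address, length) entry in 'str_list' points into 'content', so the
+  // bytes are copied into std::string objects here before 'contiguous_buffer'
+  // goes out of scope.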
+  for (const auto& [addr, len] : str_list) {
+    input_list->push_back(std::string(addr, len));
+  }
+
+  size_t element_cnt = str_list.size();
+  if (err != nullptr) {
+    RESPOND_AND_SET_NULL_IF_ERROR(response, err);
+    FillStringTensor(input_list, request_element_cnt - element_cnt);
+  }
+  return cuda_copy;
+}
+
+bool
+SetStringOutputBuffer(
+    torch::List<torch::jit::IValue>* tensor, TRITONBACKEND_Response** response,
+    TRITONBACKEND_Output* response_output, const size_t tensor_element_count,
+    cudaStream_t stream, std::string* serialized)
+{
+  return SetStringBuffer(
+      tensor, response, response_output, nullptr /* response_state */,
+      tensor_element_count, stream, serialized, false /* state */);
+}
+
+bool
+SetStringStateBuffer(
+    torch::List<torch::jit::IValue>* tensor, TRITONBACKEND_Response** response,
+    TRITONBACKEND_State* response_state, const size_t tensor_element_count,
+    cudaStream_t stream, std::string* serialized)
+{
+  return SetStringBuffer(
+      tensor, response, nullptr /* response_output */, response_state,
+      tensor_element_count, stream, serialized, true /* state */);
+}
+
+} // namespace triton::backend::pytorch
diff --git a/src/string_utils.hh b/src/string_utils.hh
new file mode 100644
index 0000000..8373478
--- /dev/null
+++ b/src/string_utils.hh
@@ -0,0 +1,106 @@
+// Copyright 2019-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions
+// are met:
+//  * Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+//  * Redistributions in binary form must reproduce the above copyright
+//    notice, this list of conditions and the following disclaimer in the
+//    documentation and/or other materials provided with the distribution.
+//  * Neither the name of NVIDIA CORPORATION nor the names of its
+//    contributors may be used to endorse or promote products derived
+//    from this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+#pragma once
+
+#include <stdint.h>
+
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "libtorch_utils.h"
+#include "triton/backend/backend_common.h"
+#include "triton/backend/backend_input_collector.h"
+#include "triton/backend/backend_memory.h"
+#include "triton/backend/backend_model.h"
+#include "triton/backend/backend_model_instance.h"
+#include "triton/backend/backend_output_responder.h"
+#include "triton/common/nvtx.h"
+#include "triton/core/tritonbackend.h"
+
+#ifdef TRITON_PYTORCH_ENABLE_TORCHVISION
+// Suppress warnings in torch headers
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wsign-compare"
+#pragma warning(push, 0)
+#include <torchvision/ops/ops.h>
+#include <torchvision/vision.h> // Torchvision header
+#pragma warning(pop)
+#pragma GCC diagnostic pop
+#endif // TRITON_PYTORCH_ENABLE_TORCHVISION
+
+#ifdef TRITON_ENABLE_GPU
+#include <c10/cuda/CUDACachingAllocator.h>
+#include <c10/cuda/CUDAStream.h>
+#include <cuda_runtime_api.h>
+#endif // TRITON_ENABLE_GPU
+
+// for thread control
+// https://pytorch.org/docs/stable/notes/cpu_threading_torchscript_inference.html#runtime-api
+// https://github.com/pytorch/pytorch/blob/v2.2.1-rc3/aten/src/ATen/Parallel.h#L133
+#include <ATen/Parallel.h>
+
+
+namespace triton::backend::pytorch {
+
+void FillStringTensor(torch::List<std::string>* input_list, const size_t cnt);
+
+// This function will return a tensor's contents as a contiguous
+// chunk in system memory. In some cases this will require copying the data.
+// If that happens, 'contiguous_buffer' will be set to hold the contiguous
+// chunk and 'cuda_copy' will be set to indicate whether CUDA copy is
+// conducted. The data copy can be avoided if the input is already in
+// a contiguous chunk and the input is located in memory type and id
+// specified.
+TRITONSERVER_Error* GetContiguousInputContent(
+    TRITONBACKEND_Input* rinput, const uint32_t buffer_count,
+    const char** content, size_t* content_byte_size,
+    std::vector<char>* contiguous_buffer, cudaStream_t stream, bool* cuda_copy);
+
+bool SetStringBuffer(
+    torch::List<torch::jit::IValue>* tensor, TRITONBACKEND_Response** response,
+    TRITONBACKEND_Output* response_output, TRITONBACKEND_State* response_state,
+    const size_t tensor_element_count, cudaStream_t stream,
+    std::string* serialized, bool state);
+
+bool SetStringInputTensor(
+    torch::List<std::string>* input_list, TRITONBACKEND_Input* input,
+    const char* name, const uint32_t buffer_count,
+    const size_t request_element_cnt, TRITONBACKEND_Response** response,
+    cudaStream_t stream, const char* host_policy_name);
+
+bool SetStringOutputBuffer(
+    torch::List<torch::jit::IValue>* tensor, TRITONBACKEND_Response** response,
+    TRITONBACKEND_Output* response_output, const size_t tensor_element_count,
+    cudaStream_t stream, std::string* serialized);
+
+bool SetStringStateBuffer(
+    torch::List<torch::jit::IValue>* tensor, TRITONBACKEND_Response** response,
+    TRITONBACKEND_State* response_state, const size_t tensor_element_count,
+    cudaStream_t stream, std::string* serialized);
+
+} // namespace triton::backend::pytorch
diff --git a/tools/gen_pb_exec_env.sh b/tools/gen_pb_exec_env.sh
new file mode 100755
index 0000000..19539cd
--- /dev/null
+++ b/tools/gen_pb_exec_env.sh
@@ -0,0 +1,53 @@
+#!/bin/bash
+# Copyright 2023-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+#  * Redistributions of source code must retain the above copyright
+#    notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +# install conda +rm -rf ./miniconda +wget https://repo.anaconda.com/miniconda/Miniconda3-py312_25.7.0-2-Linux-x86_64.sh +bash Miniconda3-py312_25.7.0-2-Linux-x86_64.sh -p ./miniconda -b +eval "$(./miniconda/bin/conda shell.bash hook)" + +# create conda environment +conda tos accept --override-channels --channel https://repo.anaconda.com/pkgs/main +conda tos accept --override-channels --channel https://repo.anaconda.com/pkgs/r +conda create -n pt python=3.12 -y +conda activate pt +conda install -c conda-forge conda-pack -y + +# pre install step +export PYTHONNOUSERSITE=True +conda install -c conda-forge libstdcxx-ng=15 -y + +# install PyTorch +conda install pytorch torchvision torchaudio pytorch-cuda=12.4 -c pytorch -c nvidia -y + +# pack environment +rm -f pb_exec_env_model.py.tar.gz +conda pack -o pb_exec_env_model.py.tar.gz + +# deactivate conda +conda deactivate
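
The script above only produces the relocatable conda-pack archive. As a hedged sketch of how such an archive is typically consumed, the model name, repository layout, and use of the Python backend's EXECUTION_ENV_PATH parameter below are illustrative assumptions rather than something this change establishes:

```bash
# Illustrative only: generate the archive, then point a Python-based model at
# it via the Python backend's EXECUTION_ENV_PATH parameter.
./tools/gen_pb_exec_env.sh
cp pb_exec_env_model.py.tar.gz model_repository/my_model/
cat >> model_repository/my_model/config.pbtxt <<'EOF'
parameters: {
  key: "EXECUTION_ENV_PATH"
  value: {string_value: "$$TRITON_MODEL_DIRECTORY/pb_exec_env_model.py.tar.gz"}
}
EOF
```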