diff --git a/.clang-format b/.clang-format index 98c64973..bf96a593 100644 --- a/.clang-format +++ b/.clang-format @@ -2,6 +2,7 @@ BasedOnStyle: Google IndentWidth: 2 +ColumnLimit: 80 ContinuationIndentWidth: 4 UseTab: Never MaxEmptyLinesToKeep: 2 @@ -34,4 +35,4 @@ BinPackArguments: true BinPackParameters: true ConstructorInitializerAllOnOneLineOrOnePerLine: false -IndentCaseLabels: true \ No newline at end of file +IndentCaseLabels: true diff --git a/.devcontainer/Dockerfile b/.devcontainer/Dockerfile new file mode 100644 index 00000000..737725bb --- /dev/null +++ b/.devcontainer/Dockerfile @@ -0,0 +1,48 @@ +# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
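+# A minimal sketch of building and entering this image by hand, outside the
+# VS Code Dev Containers flow; the image tag used below is only an example:
+#
+#   docker build -t python-backend-devcontainer -f .devcontainer/Dockerfile .
+#   docker run --gpus=all --shm-size=2g -it python-backend-devcontainer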
+ +FROM nvcr.io/nvidia/tritonserver:24.03-py3 + +ARG USERNAME=triton-server + +RUN apt-get update \ + && apt-get install -y sudo + +RUN pip3 install transformers torch + +# Create the user +RUN apt-get update \ + && apt-get install -y sudo \ + && echo $USERNAME ALL=\(root\) NOPASSWD:ALL > /etc/sudoers.d/$USERNAME \ + && chmod 0440 /etc/sudoers.d/$USERNAME + +RUN pip3 install pre-commit ipdb + +RUN mkhomedir_helper triton-server + +RUN apt-get install -y cmake rapidjson-dev + +USER ${USERNAME} diff --git a/.devcontainer/devcontainer.json b/.devcontainer/devcontainer.json new file mode 100644 index 00000000..e1b8bd10 --- /dev/null +++ b/.devcontainer/devcontainer.json @@ -0,0 +1,26 @@ +{ + "name": "Python Backend", + + "build": { + "dockerfile": "Dockerfile" + }, + "customizations": { + "vscode": { + "extensions": [ + "ms-python.vscode-pylance", + "ms-python.python", + "ms-vscode.cpptools-extension-pack", + "ms-vscode.cmake-tools", + "github.vscode-pull-request-github" + ] + } + }, + "postCreateCommand": "sudo chown -R triton-server:triton-server ~/.cache", + + "runArgs": [ "--cap-add=SYS_PTRACE", "--security-opt", "seccomp=unconfined", "--gpus=all", "--shm-size=2g", "--ulimit", "stack=67108864" ], + "mounts": [ + "source=${localEnv:HOME}/.ssh,target=/home/triton-server/.ssh,type=bind,consistency=cached", + "source=${localEnv:HOME}/.cache/huggingface,target=/home/triton-server/.cache/huggingface,type=bind,consistency=cached" + ], + "remoteUser": "triton-server" +} diff --git a/.github/workflows/codeql.yml b/.github/workflows/codeql.yml new file mode 100644 index 00000000..745a3373 --- /dev/null +++ b/.github/workflows/codeql.yml @@ -0,0 +1,84 @@ +# Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +name: "CodeQL" + +on: + pull_request: + +jobs: + analyze: + name: Analyze + runs-on: ubuntu-latest + permissions: + actions: read + contents: read + security-events: write + + strategy: + fail-fast: false + matrix: + language: [ 'python' ] + # CodeQL supports [ 'cpp', 'csharp', 'go', 'java', 'javascript', 'python', 'ruby' ] + # Learn more about CodeQL language support at https://aka.ms/codeql-docs/language-support + + steps: + - name: Checkout repository + uses: actions/checkout@v3 + + # Initializes the CodeQL tools for scanning. + - name: Initialize CodeQL + uses: github/codeql-action/init@v2 + with: + languages: ${{ matrix.language }} + # If you wish to specify custom queries, you can do so here or in a config file. + # By default, queries listed here will override any specified in a config file. + # Prefix the list here with "+" to use these queries and those in the config file. + + # Details on CodeQL's query packs refer to: + # https://docs.github.com/en/code-security/code-scanning/automatically-scanning-your-code-for-vulnerabilities-and-errors/configuring-code-scanning#using-queries-in-ql-packs + queries: +security-and-quality + + + # Autobuild attempts to build any compiled languages (C/C++, C#, Go, or Java). + # If this step fails, then you should remove it and run the build manually (see below) + - name: Autobuild + uses: github/codeql-action/autobuild@v2 + + # Command-line programs to run using the OS shell. + # See https://docs.github.com/en/actions/using-workflows/workflow-syntax-for-github-actions#jobsjob_idstepsrun + + # If the Autobuild fails above, remove it and uncomment the following three lines. + # modify them (or add more) to build your code if your project, please refer to the EXAMPLE below for guidance. + + # - run: | + # echo "Run, Build Application using script" + # ./location_of_script_within_repo/buildscript.sh + + - name: Perform CodeQL Analysis + uses: github/codeql-action/analyze@v2 + with: + category: "/language:${{matrix.language}}" diff --git a/.github/workflows/pre-commit.yml b/.github/workflows/pre-commit.yml new file mode 100644 index 00000000..4fa18732 --- /dev/null +++ b/.github/workflows/pre-commit.yml @@ -0,0 +1,38 @@ +# Copyright 2023-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +name: pre-commit + +on: + pull_request: + +jobs: + pre-commit: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v5.0.0 + - uses: actions/setup-python@v6.0.0 + - uses: pre-commit/action@v3.0.1 diff --git a/.gitignore b/.gitignore index 61840bad..419005f0 100644 --- a/.gitignore +++ b/.gitignore @@ -1,6 +1,6 @@ /build -/.vscode *.so +builddir ### Python ### # Byte-compiled / optimized / DLL files @@ -138,3 +138,6 @@ dmypy.json # pytype static type analyzer .pytype/ +# vscode +.vscode/settings.json +.vscode/c_cpp_properties.json diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml new file mode 100644 index 00000000..3c76a6ed --- /dev/null +++ b/.pre-commit-config.yaml @@ -0,0 +1,73 @@ +# Copyright 2023-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
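+# A minimal sketch of running these hooks locally, assuming Python and pip
+# are available; the CI workflow above runs the same hooks on every pull
+# request through pre-commit/action:
+#
+#   pip install pre-commit
+#   pre-commit install          # run the hooks automatically on each commit
+#   pre-commit run --all-files  # run every hook against the whole tree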
+ +repos: +- repo: https://github.com/PyCQA/isort + rev: 5.12.0 + hooks: + - id: isort + additional_dependencies: [toml] +- repo: https://github.com/psf/black + rev: 23.1.0 + hooks: + - id: black + types_or: [python, cython] +- repo: https://github.com/PyCQA/flake8 + rev: 7.3.0 + hooks: + - id: flake8 + args: [--max-line-length=88, --select=C,E,F,W,B,B950, --extend-ignore = E203,E501] + types_or: [python, cython] +- repo: https://github.com/pre-commit/mirrors-clang-format + rev: v16.0.5 + hooks: + - id: clang-format + types_or: [c, c++, cuda, proto, textproto, java] + args: ["-fallback-style=none", "-style=file", "-i"] +- repo: https://github.com/codespell-project/codespell + rev: v2.2.4 + hooks: + - id: codespell + additional_dependencies: [tomli] + args: ["--toml", "pyproject.toml"] + exclude: (?x)^(.*stemmer.*|.*stop_words.*|^CHANGELOG.md$) +# More details about these pre-commit hooks here: +# https://pre-commit.com/hooks.html +- repo: https://github.com/pre-commit/pre-commit-hooks + rev: v6.0.0 + hooks: + - id: check-case-conflict + - id: check-executables-have-shebangs + - id: check-merge-conflict + - id: check-json + - id: check-toml + - id: check-yaml + - id: check-shebang-scripts-are-executable + - id: end-of-file-fixer + types_or: [c, c++, cuda, proto, textproto, java, python] + - id: mixed-line-ending + - id: requirements-txt-fixer + - id: trailing-whitespace diff --git a/.vscode/tasks.json b/.vscode/tasks.json new file mode 100644 index 00000000..597a746d --- /dev/null +++ b/.vscode/tasks.json @@ -0,0 +1,85 @@ +{ + "version": "2.0.0", + "tasks": [ + { + "label": "Configure", + "type": "shell", + "command": "cmake", + "args": [ + "-DCMAKE_INSTALL_PREFIX:STRING=/opt/tritonserver/", + "-DTRITON_COMMON_REPO_TAG:STRING=main", + "-DTRITON_BACKEND_REPO_TAG:STRING=main", + "-DTRITON_CORE_REPO_TAG:STRING=main", + "-DTRITON_ENABLE_GPU:STRING=ON", + "-DTRITON_ENABLE_NVTX:STRING=ON", + "-DCMAKE_INSTALL_PREFIX:STRING=${workspaceFolder}/build/install", + "-DCMAKE_EXPORT_COMPILE_COMMANDS:BOOL=TRUE", + "-DCMAKE_BUILD_TYPE:STRING=Debug", + "-DCMAKE_C_COMPILER:FILEPATH=/usr/bin/gcc", + "-DCMAKE_CXX_COMPILER:FILEPATH=/usr/bin/g++", + "-S${workspaceFolder}", + "-B${workspaceFolder}/build", + "-G", + "Unix Makefiles" + ], + "problemMatcher": [] + }, + { + "label": "Build", + "type": "shell", + "command": "cmake", + "args": [ + "--build", + "/${workspaceFolder}/build", + "--config", + "Debug", + "--target", + "all", + "-j", + "18", + "--" + ] + }, + { + "label": "Install", + "type": "shell", + "command": "cmake", + "args": [ + "--build", + "${workspaceFolder}/build", + "--config", + "Debug", + "--target", + "install", + "-j", + "18", + "--" + ] + }, + { + "label": "Move", + "type": "shell", + "command": "sudo", + "args": [ + "cp", + "-r", + "${workspaceFolder}/build/install/backends/python/*", + "/opt/tritonserver/backends/python" + ] + }, + { + "label": "Build Python Backend", + "dependsOrder": "sequence", + "dependsOn": [ + "Configure", + "Build", + "Install", + "Move" + ], + "group": { + "kind": "build", + "isDefault": true + } + } + ] +} diff --git a/CMakeLists.txt b/CMakeLists.txt index 77d029c8..f5c5b293 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# Copyright 2020-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
# # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions @@ -24,30 +24,42 @@ # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -cmake_minimum_required(VERSION 3.17) +cmake_minimum_required(VERSION 3.31.8) project(tritonpythonbackend LANGUAGES C CXX) +# Use C++17 standard as Triton's minimum required. +set(TRITON_MIN_CXX_STANDARD 17 CACHE STRING "The minimum C++ standard which features are requested to build this target.") + # # Options # # Must include options required for this project as well as any # projects included in this one by FetchContent. # -# GPU support is disabled by default because python backend doesn't -# because python backend does not need to access CUDA or GPUs -# -option(TRITON_ENABLE_GPU "Enable GPU support in backend" OFF) +option(TRITON_ENABLE_GPU "Enable GPU support in backend" ON) option(TRITON_ENABLE_STATS "Include statistics collections in backend" ON) +option(TRITON_ENABLE_NVTX "Include nvtx markers collection in backend." OFF) +# FIXME: CI needs to enable the GPU flag. Python for Windows currently does not +# support GPU tensors. For simplicity, we will override this option here. +if(WIN32) + set(TRITON_ENABLE_GPU OFF CACHE BOOL "GPU disabled" FORCE) +endif() + +set(TRITON_REPO_ORGANIZATION "https://github.com/triton-inference-server" CACHE STRING "Git repository to pull from") set(TRITON_BACKEND_REPO_TAG "main" CACHE STRING "Tag for triton-inference-server/backend repo") set(TRITON_COMMON_REPO_TAG "main" CACHE STRING "Tag for triton-inference-server/common repo") set(TRITON_CORE_REPO_TAG "main" CACHE STRING "Tag for triton-inference-server/core repo") +if(NOT CMAKE_BUILD_TYPE) + set(CMAKE_BUILD_TYPE Release) +endif() + # # Dependencies # -# FetchContent's composibility isn't very good. We must include the +# FetchContent's composability isn't very good. We must include the # transitive closure of all repos so that we can override the tag. # include(FetchContent) @@ -58,39 +70,64 @@ include(ExternalProject) FetchContent_Declare( repo-common - GIT_REPOSITORY https://github.com/triton-inference-server/common.git + GIT_REPOSITORY ${TRITON_REPO_ORGANIZATION}/common.git GIT_TAG ${TRITON_COMMON_REPO_TAG} - GIT_SHALLOW ON ) FetchContent_Declare( repo-core - GIT_REPOSITORY https://github.com/triton-inference-server/core.git + GIT_REPOSITORY ${TRITON_REPO_ORGANIZATION}/core.git GIT_TAG ${TRITON_CORE_REPO_TAG} - GIT_SHALLOW ON ) FetchContent_Declare( repo-backend - GIT_REPOSITORY https://github.com/triton-inference-server/backend.git + GIT_REPOSITORY ${TRITON_REPO_ORGANIZATION}/backend.git GIT_TAG ${TRITON_BACKEND_REPO_TAG} - GIT_SHALLOW ON ) FetchContent_MakeAvailable(repo-common repo-core repo-backend) FetchContent_Declare( pybind11 GIT_REPOSITORY "https://github.com/pybind/pybind11" - GIT_TAG "v2.6" + # COMMIT ID for v2.12.0 + GIT_TAG "3e9dfa2866941655c56877882565e7577de6fc7b" GIT_SHALLOW ON ) + +# RHEL base container has multiple versions of Python installed. By default +# it seems like pybind will pick up v3.6, so we specifically assign it to +# search for 3.12 here.
+set(RHEL_BUILD OFF) +if(LINUX) + file(STRINGS "/etc/os-release" DISTRO_ID_LIKE REGEX "ID_LIKE") + if(${DISTRO_ID_LIKE} MATCHES "rhel|centos") + set(RHEL_BUILD ON) + endif(${DISTRO_ID_LIKE} MATCHES "rhel|centos") +endif(LINUX) FetchContent_MakeAvailable(pybind11) +# +# DLPack +# +FetchContent_Declare( + dlpack + GIT_REPOSITORY "https://github.com/dmlc/dlpack" + GIT_TAG "v0.8" + GIT_SHALLOW ON +) +# Option must be set off so WIN32 build does not break +set(CMAKE_POLICY_DEFAULT_CMP0077 NEW) +set(BUILD_MOCK OFF) +FetchContent_MakeAvailable(dlpack) + # # Boost # +set(TRITON_BOOST_URL "https://archives.boost.io/release/1.80.0/source/boost_1_80_0.tar.gz" CACHE STRING "Boost source code URL") + ExternalProject_Add( boostorg - URL https://boostorg.jfrog.io/artifactory/main/release/1.76.0/source/boost_1_76_0.tar.gz - URL_HASH SHA256=7bd7ddceec1a1dfdcbdb3e609b60d01739c38390a5f956385a12f3122049f0ca + URL ${TRITON_BOOST_URL} + URL_HASH SHA256=4b2136f98bdd1f5857f1c3dea9ac2018effe65286cf251534b6ae20cc45e1847 PREFIX "boost-src" CONFIGURE_COMMAND ${CMAKE_COMMAND} -E copy_directory /boost/ ${CMAKE_BINARY_DIR}/boost @@ -99,30 +136,127 @@ ExternalProject_Add( ) set(boostorg_INCLUDE_DIRS "${CMAKE_BINARY_DIR}/boost/") +# +# CUDA +# +if(${TRITON_ENABLE_GPU}) + find_package(CUDAToolkit REQUIRED) + message(STATUS "Using CUDA ${CUDA_VERSION}") + set(CUDA_NVCC_FLAGS -std=c++${TRITON_MIN_CXX_STANDARD}) +elseif() + message(WARNING "TRITON_ENABLE_GPU is OFF, GPU Tensor support will be disabled") +endif() # TRITON_ENABLE_GPU + +if(${TRITON_ENABLE_NVTX}) + add_definitions(-DTRITON_ENABLE_NVTX=1) +endif() # TRITON_ENABLE_NVTX + find_package(ZLIB REQUIRED) -find_package(Threads REQUIRED) + +if(NOT WIN32) + find_package(Threads REQUIRED) +endif() include_directories(${CMAKE_BINARY_DIR}) configure_file(src/libtriton_python.ldscript libtriton_python.ldscript COPYONLY) -add_library( - triton-python-backend SHARED - src/python.cc +set( + COMMON_SRCS + src/correlation_id.cc + src/correlation_id.h + src/infer_response.cc + src/infer_response.h + src/infer_request.cc + src/infer_request.h + src/infer_trace.cc + src/infer_trace.h + src/message_queue.h + src/ipc_message.cc + src/ipc_message.h + src/pb_string.cc + src/pb_string.h + src/pb_map.cc + src/pb_map.h + src/scoped_defer.cc + src/scoped_defer.h + src/pb_error.cc + src/pb_error.h + src/pb_log.cc + src/pb_log.h + src/pb_memory.cc + src/pb_memory.h + src/pb_tensor.cc + src/pb_tensor.h src/pb_utils.cc src/pb_utils.h - src/pb_env.cc - src/pb_env.h src/shm_manager.cc src/shm_manager.h + src/pb_exception.h + src/pb_preferred_memory.h + src/metric.h + src/metric.cc + src/metric_family.h + src/metric_family.cc + src/gpu_buffers.cc + src/gpu_buffers.h + src/model_loader.h + src/model_loader.cc +) + +set( + PYTHON_BACKEND_SRCS + src/python_be.cc + src/python_be.h + src/pb_env.cc + src/pb_env.h + src/pb_metric_reporter.cc + src/pb_metric_reporter.h + src/memory_manager.cc + src/memory_manager.h + src/request_executor.cc + src/request_executor.h + src/stub_launcher.h + src/stub_launcher.cc + src/infer_payload.h + src/infer_payload.cc +) + +list(APPEND + PYTHON_BACKEND_SRCS + ${COMMON_SRCS} +) + +add_library( + triton-python-backend SHARED + ${PYTHON_BACKEND_SRCS} +) + +set( + PYTHON_BACKEND_STUB_SRCS + src/pb_stub_utils.h + src/pb_stub_utils.cc + src/response_sender.cc + src/response_sender.h + src/pb_stub.h + src/pb_stub.cc + src/pb_stub_log.h + src/pb_stub_log.cc + src/pb_response_iterator.h + src/pb_response_iterator.cc + src/pb_cancel.cc + src/pb_cancel.h + 
src/pb_bls_cancel.cc + src/pb_bls_cancel.h +) + +list(APPEND + PYTHON_BACKEND_STUB_SRCS + ${COMMON_SRCS} ) add_executable( triton-python-backend-stub - src/pb_stub.cc - src/pb_utils.cc - src/pb_utils.h - src/shm_manager.cc - src/shm_manager.h + ${PYTHON_BACKEND_STUB_SRCS} ) add_dependencies(triton-python-backend boostorg) @@ -134,46 +268,106 @@ add_library( TritonPythonBackend::triton-python-backend ALIAS triton-python-backend ) -target_compile_features(triton-python-backend PRIVATE cxx_std_11) +target_compile_features(triton-python-backend PRIVATE cxx_std_${TRITON_MIN_CXX_STANDARD}) target_compile_options( triton-python-backend PRIVATE $<$,$,$>: - -Wall -Wextra -Wno-unused-parameter -Wno-type-limits -Werror> + -Wall -Wextra -Wno-unused-parameter -Wno-type-limits> + $<$:/Wall /D_WIN32_WINNT=0x0A00 /EHsc /Zc:preprocessor> ) -target_compile_features(triton-python-backend-stub PRIVATE cxx_std_11) +target_compile_features(triton-python-backend-stub PRIVATE cxx_std_${TRITON_MIN_CXX_STANDARD}) target_compile_options( triton-python-backend-stub PRIVATE $<$,$,$>: - -fvisibility=hidden -Wall -Wextra -Wno-unused-parameter -Wno-type-limits -Werror> + -fvisibility=hidden -Wall -Wextra -Wno-unused-parameter -Wno-type-limits -Werror> + $<$:/Wall /D_WIN32_WINNT=0x0A00 /EHsc /Zc:preprocessor> ) +target_compile_definitions(triton-python-backend-stub PRIVATE TRITON_PB_STUB) -target_link_libraries( - triton-python-backend - PRIVATE - triton-core-serverstub # from repo-core - triton-backend-utils # from repo-backend - ZLIB::ZLIB - -larchive # shared memory -) +# RHEL assets are not released in a container environment nor do the current +# Python lib versions in the manylinux base container match those currently +# available for RHEL8 package managers. Therefore, we package the correct +# python libs in the backend folder and adjust the stub executable to look +# in its own folder at runtime. 
+if(RHEL_BUILD) + set_target_properties( + triton-python-backend-stub + PROPERTIES + SKIP_BUILD_RPATH TRUE + BUILD_WITH_INSTALL_RPATH TRUE + INSTALL_RPATH_USE_LINK_PATH FALSE + INSTALL_RPATH "$\{ORIGIN\}" + ) +endif(RHEL_BUILD) -target_link_libraries( - triton-python-backend - PRIVATE - triton-core-serverstub # from repo-core - triton-backend-utils # from repo-backend - ZLIB::ZLIB - -larchive # shared memory -) -target_link_libraries( - triton-python-backend-stub - PRIVATE - Threads::Threads - pybind11::embed - triton-backend-utils # from repo-backend - -larchive # libarchive - -lrt # shared memory -) -set_target_properties( - triton-python-backend PROPERTIES - POSITION_INDEPENDENT_CODE ON - OUTPUT_NAME triton_python - LINK_DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/libtriton_python.ldscript - LINK_FLAGS "-Wl,--version-script libtriton_python.ldscript" -) +# For WIN32 do not link Threads and DL_LIBS +if(WIN32) + target_link_libraries( + triton-python-backend + PRIVATE + dlpack + triton-backend-utils # from repo-backend + -lrt # shared memory + triton-core-serverstub # from repo-core + ZLIB::ZLIB + -larchive + ) + + target_link_libraries( + triton-python-backend-stub + PRIVATE + dlpack + triton-backend-utils # from repo-backend + pybind11::embed + -lrt # shared memory + -larchive # libarchive + ) +else() + target_link_libraries( + triton-python-backend + PRIVATE + dlpack + Threads::Threads + triton-backend-utils # from repo-backend + ${CMAKE_DL_LIBS} # dlopen and dlclose + -lrt # shared memory + triton-core-serverstub # from repo-core + ZLIB::ZLIB + -larchive + ) + + target_link_libraries( + triton-python-backend-stub + PRIVATE + dlpack + Threads::Threads + triton-backend-utils # from repo-backend + ${CMAKE_DL_LIBS} # dlopen and dlclose + pybind11::embed + -lrt # shared memory + -larchive # libarchive + ) +endif() + +if(WIN32) + set_target_properties( + triton-python-backend PROPERTIES + POSITION_INDEPENDENT_CODE ON + OUTPUT_NAME triton_python + ) +else() + set_target_properties( + triton-python-backend PROPERTIES + POSITION_INDEPENDENT_CODE ON + OUTPUT_NAME triton_python + LINK_DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/libtriton_python.ldscript + LINK_FLAGS "-Wl,--version-script libtriton_python.ldscript" + ) +endif() + +add_subdirectory(./src/shm_monitor) # # Install diff --git a/README.md b/README.md index bba6a359..dd5e877a 100644 --- a/README.md +++ b/README.md @@ -1,5 +1,5 @@ + +# Auto-Complete Example + +This example shows how to implement the +[`auto_complete_config`](https://github.com/triton-inference-server/python_backend/#auto_complete_config) +function in the Python backend to provide +[`max_batch_size`](https://github.com/triton-inference-server/server/blob/main/docs/user_guide/model_configuration.md#maximum-batch-size), +[`input`](https://github.com/triton-inference-server/server/blob/main/docs/user_guide/model_configuration.md#inputs-and-outputs) +and [`output`](https://github.com/triton-inference-server/server/blob/main/docs/user_guide/model_configuration.md#inputs-and-outputs) +properties. These properties will allow Triton to load the Python model with +[Minimal Model Configuration](https://github.com/triton-inference-server/server/blob/main/docs/user_guide/model_configuration.md#minimal-model-configuration) +in the absence of a configuration file. + +The +[model repository](https://github.com/triton-inference-server/server/blob/main/docs/user_guide/model_repository.md) +should contain the [nobatch_auto_complete](./nobatch_model.py) and +[batch_auto_complete](./batch_model.py) models. +The max_batch_size of the [nobatch_auto_complete](./nobatch_model.py) model is set +to zero, whereas the max_batch_size of the [batch_auto_complete](./batch_model.py) +model is set to 4.
For models with a non-zero value of max_batch_size, the +configuration can specify a different value of max_batch_size as long as it +does not exceed the value set in the model file. + +The +[nobatch_auto_complete](./nobatch_model.py) and +[batch_auto_complete](./batch_model.py) models calculate the sum and difference +of the `INPUT0` and `INPUT1` and put the results in `OUTPUT0` and `OUTPUT1` +respectively. + +## Deploying the Auto-Complete Models + +1. Create the model repository: + +```console +mkdir -p models/nobatch_auto_complete/1/ +mkdir -p models/batch_auto_complete/1/ + +# Copy the Python models +cp examples/auto_complete/nobatch_model.py models/nobatch_auto_complete/1/model.py +cp examples/auto_complete/batch_model.py models/batch_auto_complete/1/model.py +``` +**Note that we don't need a model configuration file since Triton will use the +auto-complete model configuration provided in the Python model.** + +2. Start the tritonserver: + +``` +tritonserver --model-repository `pwd`/models +``` + +## Running inferences on Nobatch and Batch models: + +Send inference requests using [client.py](./client.py). + +``` +python3 examples/auto_complete/client.py +``` + +You should see an output similar to the output below: + +``` +'nobatch_auto_complete' configuration matches the expected auto complete configuration + +'batch_auto_complete' configuration matches the expected auto complete configuration + +PASS: auto_complete + +``` + +The [nobatch_model.py](./nobatch_model.py) and [batch_model.py](./batch_model.py) +model files are heavily commented with explanations about how to utilize +`set_max_batch_size`, `add_input`, and `add_output`functions to set +`max_batch_size`, `input` and `output` properties of the model. + +### Explanation of the Client Output + +For each model, the [client.py](./client.py) first requests the model +configuration from Triton to validate if the model configuration has been +registered as expected. The client then sends an inference request to verify +whether the inference has run properly and the result is correct. diff --git a/examples/auto_complete/batch_model.py b/examples/auto_complete/batch_model.py new file mode 100644 index 00000000..98fa06f5 --- /dev/null +++ b/examples/auto_complete/batch_model.py @@ -0,0 +1,212 @@ +# Copyright 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import json + +# triton_python_backend_utils is available in every Triton Python model. You +# need to use this module to create inference requests and responses. It also +# contains some utility functions for extracting information from model_config +# and converting Triton input/output types to numpy types. +import triton_python_backend_utils as pb_utils + + +class TritonPythonModel: + """Your Python model must use the same class name. Every Python model + that is created must have "TritonPythonModel" as the class name. + """ + + @staticmethod + def auto_complete_config(auto_complete_model_config): + """`auto_complete_config` is called only once when loading the model assuming + the server was not started with `--disable-auto-complete-config`. Implementing + this function is optional. No implementation of `auto_complete_config` will + do nothing. This function can be used to set `max_batch_size`, `input` and + `output` properties of the model using `set_max_batch_size`, `add_input`, and + `add_output`. These properties will allow Triton to load the model with minimal + model configuration in absence of a configuration file. This function returns + the `pb_utils.ModelConfig` object with these properties. You can use the `as_dict` + function to gain read-only access to the `pb_utils.ModelConfig` object. + The `pb_utils.ModelConfig` object being returned from here will be used as + the final configuration for the model. + + Note: The Python interpreter used to invoke this function will be + destroyed upon returning from this function and as a result none of + the objects created here will be available in the `initialize`, + `execute`, or `finalize` functions. + + Parameters + ---------- + auto_complete_model_config : pb_utils.ModelConfig + An object containing the existing model configuration. You can build + upon the configuration given by this object when setting the + properties for this model. + + Returns + ------- + pb_utils.ModelConfig + An object containing the auto-completed model configuration + """ + inputs = [ + {"name": "INPUT0", "data_type": "TYPE_FP32", "dims": [4]}, + {"name": "INPUT1", "data_type": "TYPE_FP32", "dims": [4]}, + ] + outputs = [ + {"name": "OUTPUT0", "data_type": "TYPE_FP32", "dims": [4]}, + {"name": "OUTPUT1", "data_type": "TYPE_FP32", "dims": [4]}, + ] + + # Demonstrate the usage of `as_dict`, `add_input`, `add_output`, + # and `set_max_batch_size` functions. + # Store the model configuration as a dictionary. + config = auto_complete_model_config.as_dict() + input_names = [] + output_names = [] + for input in config["input"]: + input_names.append(input["name"]) + for output in config["output"]: + output_names.append(output["name"]) + + for input in inputs: + # The name checking here is only for demonstrating the usage of + # `as_dict` function. `add_input` will check for conflicts and + # raise errors if an input with the same name already exists in + # the configuration but has different data_type or dims property. 
+ if input["name"] not in input_names: + auto_complete_model_config.add_input(input) + for output in outputs: + # The name checking here is only for demonstrating the usage of + # `as_dict` function. `add_output` will check for conflicts and + # raise errors if an output with the same name already exists in + # the configuration but has different data_type or dims property. + if output["name"] not in output_names: + auto_complete_model_config.add_output(output) + + auto_complete_model_config.set_max_batch_size(4) + auto_complete_model_config.set_dynamic_batching() + + return auto_complete_model_config + + def initialize(self, args): + """`initialize` is called only once when the model is being loaded. + Implementing `initialize` function is optional. This function allows + the model to initialize any state associated with this model. + + Parameters + ---------- + args : dict + Both keys and values are strings. The dictionary keys and values are: + * model_config: A JSON string containing the model configuration + * model_instance_kind: A string containing model instance kind + * model_instance_device_id: A string containing model instance device ID + * model_repository: Model repository path + * model_version: Model version + * model_name: Model name + """ + + # You must parse model_config. JSON string is not parsed here + self.model_config = model_config = json.loads(args["model_config"]) + + # Get OUTPUT0 configuration + output0_config = pb_utils.get_output_config_by_name(model_config, "OUTPUT0") + + # Get OUTPUT1 configuration + output1_config = pb_utils.get_output_config_by_name(model_config, "OUTPUT1") + + # Convert Triton types to numpy types + self.output0_dtype = pb_utils.triton_string_to_numpy( + output0_config["data_type"] + ) + self.output1_dtype = pb_utils.triton_string_to_numpy( + output1_config["data_type"] + ) + + def execute(self, requests): + """`execute` MUST be implemented in every Python model. `execute` + function receives a list of pb_utils.InferenceRequest as the only + argument. This function is called when an inference request is made + for this model. Depending on the batching configuration (e.g. Dynamic + Batching) used, `requests` may contain multiple requests. Every + Python model, must create one pb_utils.InferenceResponse for every + pb_utils.InferenceRequest in `requests`. If there is an error, you can + set the error argument when creating a pb_utils.InferenceResponse + + Parameters + ---------- + requests : list + A list of pb_utils.InferenceRequest + + Returns + ------- + list + A list of pb_utils.InferenceResponse. The length of this list must + be the same as `requests` + """ + + output0_dtype = self.output0_dtype + output1_dtype = self.output1_dtype + + responses = [] + + # Every Python backend must iterate over everyone of the requests + # and create a pb_utils.InferenceResponse for each of them. + for request in requests: + # Get INPUT0 + in_0 = pb_utils.get_input_tensor_by_name(request, "INPUT0") + # Get INPUT1 + in_1 = pb_utils.get_input_tensor_by_name(request, "INPUT1") + + out_0, out_1 = ( + in_0.as_numpy() + in_1.as_numpy(), + in_0.as_numpy() - in_1.as_numpy(), + ) + + # Create output tensors. You need pb_utils.Tensor + # objects to create pb_utils.InferenceResponse. + out_tensor_0 = pb_utils.Tensor("OUTPUT0", out_0.astype(output0_dtype)) + out_tensor_1 = pb_utils.Tensor("OUTPUT1", out_1.astype(output1_dtype)) + + # Create InferenceResponse. You can set an error here in case + # there was a problem with handling this inference request. 
+ # Below is an example of how you can set errors in inference + # response: + # + # pb_utils.InferenceResponse( + # output_tensors=..., TritonError("An error occurred")) + inference_response = pb_utils.InferenceResponse( + output_tensors=[out_tensor_0, out_tensor_1] + ) + responses.append(inference_response) + + # You should return a list of pb_utils.InferenceResponse. Length + # of this list must match the length of `requests` list. + return responses + + def finalize(self): + """`finalize` is called only once when the model is being unloaded. + Implementing `finalize` function is OPTIONAL. This function allows + the model to perform any necessary clean ups before exit. + """ + print("Cleaning up...") diff --git a/examples/auto_complete/client.py b/examples/auto_complete/client.py new file mode 100644 index 00000000..24fc1fac --- /dev/null +++ b/examples/auto_complete/client.py @@ -0,0 +1,83 @@ +# Copyright 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
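+# This client assumes a Triton server is already running on localhost:8000
+# with the auto_complete models loaded (see the README in this directory),
+# and that the HTTP client dependencies are installed, for example with
+# `pip install "tritonclient[http]" numpy`.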
+ +import sys + +import numpy as np +import tritonclient.http as httpclient +from tritonclient.utils import * + +nobatch_model_name = "nobatch_auto_complete" +batch_model_name = "batch_auto_complete" + + +def validate_ios(config, expected_ios, model_name): + for io in config: + for expected_io in expected_ios: + if io["name"] == expected_io["name"]: + if io["data_type"] != expected_io["data_type"]: + print("model '" + model_name + "' has unexpected data_type") + sys.exit(1) + elif io["dims"] != expected_io["dims"]: + print("model '" + model_name + "' has unexpected dims") + sys.exit(1) + + +if __name__ == "__main__": + with httpclient.InferenceServerClient("localhost:8000") as client: + expected_max_batch_size = { + "nobatch_auto_complete": 0, + "batch_auto_complete": 4, + } + expected_inputs = [ + {"name": "INPUT0", "data_type": "TYPE_FP32", "dims": [4]}, + {"name": "INPUT1", "data_type": "TYPE_FP32", "dims": [4]}, + ] + expected_outputs = [ + {"name": "OUTPUT0", "data_type": "TYPE_FP32", "dims": [4]}, + {"name": "OUTPUT1", "data_type": "TYPE_FP32", "dims": [4]}, + ] + + models = [nobatch_model_name, batch_model_name] + + for model_name in models: + # Validate the auto-complete model configuration + model_config = client.get_model_config(model_name) + if model_config["max_batch_size"] != expected_max_batch_size[model_name]: + print("model '" + model_name + "' has unexpected max_batch_size") + sys.exit(1) + validate_ios(model_config["input"], expected_inputs, model_name) + validate_ios(model_config["output"], expected_outputs, model_name) + print( + "'" + + model_name + + "' configuration matches the expected " + + "auto complete configuration\n" + ) + + print("PASS: auto_complete") + + sys.exit(0) diff --git a/examples/auto_complete/nobatch_model.py b/examples/auto_complete/nobatch_model.py new file mode 100644 index 00000000..6e875138 --- /dev/null +++ b/examples/auto_complete/nobatch_model.py @@ -0,0 +1,211 @@ +# Copyright 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +import json + +# triton_python_backend_utils is available in every Triton Python model. You +# need to use this module to create inference requests and responses. It also +# contains some utility functions for extracting information from model_config +# and converting Triton input/output types to numpy types. +import triton_python_backend_utils as pb_utils + + +class TritonPythonModel: + """Your Python model must use the same class name. Every Python model + that is created must have "TritonPythonModel" as the class name. + """ + + @staticmethod + def auto_complete_config(auto_complete_model_config): + """`auto_complete_config` is called only once when loading the model assuming + the server was not started with `--disable-auto-complete-config`. Implementing + this function is optional. No implementation of `auto_complete_config` will + do nothing. This function can be used to set `max_batch_size`, `input` and + `output` properties of the model using `set_max_batch_size`, `add_input`, and + `add_output`. These properties will allow Triton to load the model with minimal + model configuration in absence of a configuration file. This function returns + the `pb_utils.ModelConfig` object with these properties. You can use the `as_dict` + function to gain read-only access to the `pb_utils.ModelConfig` object. + The `pb_utils.ModelConfig` object being returned from here will be used as + the final configuration for the model. + + Note: The Python interpreter used to invoke this function will be + destroyed upon returning from this function and as a result none of + the objects created here will be available in the `initialize`, + `execute`, or `finalize` functions. + + Parameters + ---------- + auto_complete_model_config : pb_utils.ModelConfig + An object containing the existing model configuration. You can build + upon the configuration given by this object when setting the + properties for this model. + + Returns + ------- + pb_utils.ModelConfig + An object containing the auto-completed model configuration + """ + inputs = [ + {"name": "INPUT0", "data_type": "TYPE_FP32", "dims": [4]}, + {"name": "INPUT1", "data_type": "TYPE_FP32", "dims": [4]}, + ] + outputs = [ + {"name": "OUTPUT0", "data_type": "TYPE_FP32", "dims": [4]}, + {"name": "OUTPUT1", "data_type": "TYPE_FP32", "dims": [4]}, + ] + + # Demonstrate the usage of `as_dict`, `add_input`, `add_output`, + # and `set_max_batch_size` functions. + # Store the model configuration as a dictionary. + config = auto_complete_model_config.as_dict() + input_names = [] + output_names = [] + for input in config["input"]: + input_names.append(input["name"]) + for output in config["output"]: + output_names.append(output["name"]) + + for input in inputs: + # The name checking here is only for demonstrating the usage of + # `as_dict` function. `add_input` will check for conflicts and + # raise errors if an input with the same name already exists in + # the configuration but has different data_type or dims property. + if input["name"] not in input_names: + auto_complete_model_config.add_input(input) + for output in outputs: + # The name checking here is only for demonstrating the usage of + # `as_dict` function. `add_output` will check for conflicts and + # raise errors if an output with the same name already exists in + # the configuration but has different data_type or dims property. 
+ if output["name"] not in output_names: + auto_complete_model_config.add_output(output) + + auto_complete_model_config.set_max_batch_size(0) + + return auto_complete_model_config + + def initialize(self, args): + """`initialize` is called only once when the model is being loaded. + Implementing `initialize` function is optional. This function allows + the model to initialize any state associated with this model. + + Parameters + ---------- + args : dict + Both keys and values are strings. The dictionary keys and values are: + * model_config: A JSON string containing the model configuration + * model_instance_kind: A string containing model instance kind + * model_instance_device_id: A string containing model instance device ID + * model_repository: Model repository path + * model_version: Model version + * model_name: Model name + """ + + # You must parse model_config. JSON string is not parsed here + self.model_config = model_config = json.loads(args["model_config"]) + + # Get OUTPUT0 configuration + output0_config = pb_utils.get_output_config_by_name(model_config, "OUTPUT0") + + # Get OUTPUT1 configuration + output1_config = pb_utils.get_output_config_by_name(model_config, "OUTPUT1") + + # Convert Triton types to numpy types + self.output0_dtype = pb_utils.triton_string_to_numpy( + output0_config["data_type"] + ) + self.output1_dtype = pb_utils.triton_string_to_numpy( + output1_config["data_type"] + ) + + def execute(self, requests): + """`execute` MUST be implemented in every Python model. `execute` + function receives a list of pb_utils.InferenceRequest as the only + argument. This function is called when an inference request is made + for this model. Depending on the batching configuration (e.g. Dynamic + Batching) used, `requests` may contain multiple requests. Every + Python model, must create one pb_utils.InferenceResponse for every + pb_utils.InferenceRequest in `requests`. If there is an error, you can + set the error argument when creating a pb_utils.InferenceResponse + + Parameters + ---------- + requests : list + A list of pb_utils.InferenceRequest + + Returns + ------- + list + A list of pb_utils.InferenceResponse. The length of this list must + be the same as `requests` + """ + + output0_dtype = self.output0_dtype + output1_dtype = self.output1_dtype + + responses = [] + + # Every Python backend must iterate over everyone of the requests + # and create a pb_utils.InferenceResponse for each of them. + for request in requests: + # Get INPUT0 + in_0 = pb_utils.get_input_tensor_by_name(request, "INPUT0") + # Get INPUT1 + in_1 = pb_utils.get_input_tensor_by_name(request, "INPUT1") + + out_0, out_1 = ( + in_0.as_numpy() + in_1.as_numpy(), + in_0.as_numpy() - in_1.as_numpy(), + ) + + # Create output tensors. You need pb_utils.Tensor + # objects to create pb_utils.InferenceResponse. + out_tensor_0 = pb_utils.Tensor("OUTPUT0", out_0.astype(output0_dtype)) + out_tensor_1 = pb_utils.Tensor("OUTPUT1", out_1.astype(output1_dtype)) + + # Create InferenceResponse. You can set an error here in case + # there was a problem with handling this inference request. + # Below is an example of how you can set errors in inference + # response: + # + # pb_utils.InferenceResponse( + # output_tensors=..., TritonError("An error occurred")) + inference_response = pb_utils.InferenceResponse( + output_tensors=[out_tensor_0, out_tensor_1] + ) + responses.append(inference_response) + + # You should return a list of pb_utils.InferenceResponse. Length + # of this list must match the length of `requests` list. 
+ return responses + + def finalize(self): + """`finalize` is called only once when the model is being unloaded. + Implementing `finalize` function is OPTIONAL. This function allows + the model to perform any necessary clean ups before exit. + """ + print("Cleaning up...") diff --git a/examples/bls/README.md b/examples/bls/README.md new file mode 100644 index 00000000..b12ec298 --- /dev/null +++ b/examples/bls/README.md @@ -0,0 +1,157 @@ + + +# BLS Example + +In this section we demonstrate an end-to-end example for +[BLS](../../README.md#business-logic-scripting) in Python backend. The +[model repository](https://github.com/triton-inference-server/server/blob/main/docs/user_guide/model_repository.md) +should contain [pytorch](../pytorch), [addsub](../add_sub). The +[pytorch](../pytorch) and [addsub](../add_sub) models calculate the sum and +difference of the `INPUT0` and `INPUT1` and put the results in `OUTPUT0` and +`OUTPUT1` respectively. This example is broken into two sections. The first +section demonstrates how to perform synchronous BLS requests and the second +section shows how to execute asynchronous BLS requests. + +## Synchronous BLS Requests + +The goal of sync BLS model is the same as [pytorch](../pytorch) and +[addsub](../add_sub) models but the difference is that the BLS model will not +calculate the sum and difference by itself. The sync BLS model will pass the +input tensors to the [pytorch](../pytorch) or [addsub](../add_sub) models and +return the responses of that model as the final response. The additional +parameter `MODEL_NAME` determines which model will be used for calculating the +final outputs. + +1. Create the model repository: + +```console +mkdir -p models/add_sub/1 +mkdir -p models/bls_sync/1 +mkdir -p models/pytorch/1 + +# Copy the Python models +cp examples/add_sub/model.py models/add_sub/1/ +cp examples/add_sub/config.pbtxt models/add_sub/config.pbtxt +cp examples/bls/sync_model.py models/bls_sync/1/model.py +cp examples/bls/sync_config.pbtxt models/bls_sync/config.pbtxt +cp examples/pytorch/model.py models/pytorch/1/ +cp examples/pytorch/config.pbtxt models/pytorch/ +``` + +2. Start the tritonserver: + +``` +tritonserver --model-repository `pwd`/models +``` + +3. Send inference requests to server: + +``` +python3 examples/bls/sync_client.py +``` + +You should see an output similar to the output below: + +``` +=========='add_sub' model result========== +INPUT0 ([0.34984654 0.6808792 0.6509772 0.6211422 ]) + INPUT1 ([0.37917137 0.9080451 0.60789365 0.33425143]) = OUTPUT0 ([0.7290179 1.5889243 1.2588708 0.9553937]) +INPUT0 ([0.34984654 0.6808792 0.6509772 0.6211422 ]) - INPUT1 ([0.37917137 0.9080451 0.60789365 0.33425143]) = OUTPUT1 ([-0.02932483 -0.22716594 0.04308355 0.28689077]) + + +=========='pytorch' model result========== +INPUT0 ([0.34984654 0.6808792 0.6509772 0.6211422 ]) + INPUT1 ([0.37917137 0.9080451 0.60789365 0.33425143]) = OUTPUT0 ([0.7290179 1.5889243 1.2588708 0.9553937]) +INPUT0 ([0.34984654 0.6808792 0.6509772 0.6211422 ]) - INPUT1 ([0.37917137 0.9080451 0.60789365 0.33425143]) = OUTPUT1 ([-0.02932483 -0.22716594 0.04308355 0.28689077]) + + +=========='undefined' model result========== +Failed to process the request(s) for model instance 'bls_0', message: TritonModelException: Failed for execute the inference request. Model 'undefined_model' is not ready. 
+ +At: + /tmp/python_backend/models/bls/1/model.py(110): execute +``` + +The [sync_model.py](./sync_model.py) model file is heavily commented with +explanations about each of the function calls. + +### Explanation of the Client Output + +The [client.py](./sync_client.py) sends three inference requests to the 'bls_sync' +model with different values for the "MODEL_NAME" input. As explained earlier, +"MODEL_NAME" determines the model name that the "bls" model will use for +calculating the final outputs. In the first request, it will use the "add_sub" +model and in the second request it will use the "pytorch" model. The third +request uses an incorrect model name to demonstrate error handling during +the inference request execution. + +## Asynchronous BLS Requests + +In this section we explain how to send multiple BLS requests without waiting for +their response. Asynchronous execution of BLS requests will not block your +model execution and can lead to speedups under certain conditions. + +The `bls_async` model will perform two async BLS requests on the +[pytorch](../pytorch) and [addsub](../add_sub) models. Then, it will wait until +the inference requests on these models is completed. It will extract `OUTPUT0` +from the [pytorch](../pytorch) and `OUTPUT1` from the [addsub](../add_sub) model +to construct the final inference response object using these tensors. + +1. Create the model repository: + +```console +mkdir -p models/add_sub/1 +mkdir -p models/bls_async/1 +mkdir -p models/pytorch/1 + +# Copy the Python models +cp examples/add_sub/model.py models/add_sub/1/ +cp examples/add_sub/config.pbtxt models/add_sub/ +cp examples/bls/async_model.py models/bls_async/1/model.py +cp examples/bls/async_config.pbtxt models/bls_async/config.pbtxt +cp examples/pytorch/model.py models/pytorch/1/ +cp examples/pytorch/config.pbtxt models/pytorch/ +``` + +2. Start the tritonserver: + +``` +tritonserver --model-repository `pwd`/models +``` + +3. Send inference requests to server: + +``` +python3 examples/bls/async_client.py +``` + +You should see an output similar to the output below: + +``` +INPUT0 ([0.72394824 0.45873794 0.4307444 0.07681174]) + INPUT1 ([0.34224355 0.8271524 0.5831284 0.904624 ]) = OUTPUT0 ([1.0661918 1.2858903 1.0138729 0.9814357]) +INPUT0 ([0.72394824 0.45873794 0.4307444 0.07681174]) - INPUT1 ([0.34224355 0.8271524 0.5831284 0.904624 ]) = OUTPUT1 ([ 0.3817047 -0.36841443 -0.15238398 -0.82781225]) +``` diff --git a/examples/bls/async_client.py b/examples/bls/async_client.py new file mode 100644 index 00000000..6d8fe577 --- /dev/null +++ b/examples/bls/async_client.py @@ -0,0 +1,82 @@ +# Copyright 2021-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. 
+# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import sys + +import numpy as np +import tritonclient.http as httpclient +from tritonclient.utils import * + +model_name = "bls_async" +shape = [4] + +with httpclient.InferenceServerClient("localhost:8000") as client: + input0_data = np.random.rand(*shape).astype(np.float32) + input1_data = np.random.rand(*shape).astype(np.float32) + inputs = [ + httpclient.InferInput( + "INPUT0", input0_data.shape, np_to_triton_dtype(input0_data.dtype) + ), + httpclient.InferInput( + "INPUT1", input1_data.shape, np_to_triton_dtype(input1_data.dtype) + ), + ] + + inputs[0].set_data_from_numpy(input0_data) + inputs[1].set_data_from_numpy(input1_data) + + outputs = [ + httpclient.InferRequestedOutput("OUTPUT0"), + httpclient.InferRequestedOutput("OUTPUT1"), + ] + + response = client.infer(model_name, inputs, request_id=str(1), outputs=outputs) + + result = response.get_response() + output0_data = response.as_numpy("OUTPUT0") + output1_data = response.as_numpy("OUTPUT1") + + print( + "INPUT0 ({}) + INPUT1 ({}) = OUTPUT0 ({})".format( + input0_data, input1_data, output0_data + ) + ) + print( + "INPUT0 ({}) - INPUT1 ({}) = OUTPUT1 ({})".format( + input0_data, input1_data, output1_data + ) + ) + + if not np.allclose(input0_data + input1_data, output0_data): + print("BLS async example error: incorrect sum") + sys.exit(1) + + if not np.allclose(input0_data - input1_data, output1_data): + print("BLS async example error: incorrect difference") + sys.exit(1) + + print("PASS: BLS Async") + sys.exit(0) diff --git a/examples/bls/async_config.pbtxt b/examples/bls/async_config.pbtxt new file mode 100644 index 00000000..1a4144a3 --- /dev/null +++ b/examples/bls/async_config.pbtxt @@ -0,0 +1,59 @@ +# Copyright 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +name: "bls_async" +backend: "python" + +input [ + { + name: "INPUT0" + data_type: TYPE_FP32 + dims: [ 4 ] + } +] +input [ + { + name: "INPUT1" + data_type: TYPE_FP32 + dims: [ 4 ] + } +] +output [ + { + name: "OUTPUT0" + data_type: TYPE_FP32 + dims: [ 4 ] + } +] +output [ + { + name: "OUTPUT1" + data_type: TYPE_FP32 + dims: [ 4 ] + } +] + +instance_group [{ kind: KIND_CPU }] diff --git a/examples/bls/async_model.py b/examples/bls/async_model.py new file mode 100644 index 00000000..4cb0f6dc --- /dev/null +++ b/examples/bls/async_model.py @@ -0,0 +1,160 @@ +# Copyright 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import asyncio +import json + +# triton_python_backend_utils is available in every Triton Python model. You +# need to use this module to create inference requests and responses. It also +# contains some utility functions for extracting information from model_config +# and converting Triton input/output types to numpy types. +import triton_python_backend_utils as pb_utils + + +class TritonPythonModel: + """Your Python model must use the same class name. Every Python model + that is created must have "TritonPythonModel" as the class name. + """ + + def initialize(self, args): + """`initialize` is called only once when the model is being loaded. + Implementing `initialize` function is optional. This function allows + the model to initialize any state associated with this model. + + Parameters + ---------- + args : dict + Both keys and values are strings. 
The dictionary keys and values are: + * model_config: A JSON string containing the model configuration + * model_instance_kind: A string containing model instance kind + * model_instance_device_id: A string containing model instance device ID + * model_repository: Model repository path + * model_version: Model version + * model_name: Model name + """ + + # You must parse model_config. JSON string is not parsed here + self.model_config = json.loads(args["model_config"]) + + # You must add the Python 'async' keyword to the beginning of `execute` + # function if you want to use `async_exec` function. + async def execute(self, requests): + """`execute` must be implemented in every Python model. `execute` + function receives a list of pb_utils.InferenceRequest as the only + argument. This function is called when an inference request is made + for this model. Depending on the batching configuration (e.g. Dynamic + Batching) used, `requests` may contain multiple requests. Every + Python model, must create one pb_utils.InferenceResponse for every + pb_utils.InferenceRequest in `requests`. If there is an error, you can + set the error argument when creating a pb_utils.InferenceResponse + + Parameters + ---------- + requests : list + A list of pb_utils.InferenceRequest + + Returns + ------- + list + A list of pb_utils.InferenceResponse. The length of this list must + be the same as `requests` + """ + + responses = [] + # Every Python backend must iterate over everyone of the requests + # and create a pb_utils.InferenceResponse for each of them. + for request in requests: + # Get INPUT0 + in_0 = pb_utils.get_input_tensor_by_name(request, "INPUT0") + + # Get INPUT1 + in_1 = pb_utils.get_input_tensor_by_name(request, "INPUT1") + + # List of awaitables containing inflight inference responses. + inference_response_awaits = [] + for model_name in ["pytorch", "add_sub"]: + # Create inference request object + infer_request = pb_utils.InferenceRequest( + model_name=model_name, + requested_output_names=["OUTPUT0", "OUTPUT1"], + inputs=[in_0, in_1], + ) + + # Store the awaitable inside the array. We don't need + # the inference response immediately so we do not `await` + # here. + inference_response_awaits.append(infer_request.async_exec()) + + # Wait for all the inference requests to finish. The execution + # of the Python script will be blocked until all the awaitables + # are resolved. + inference_responses = await asyncio.gather(*inference_response_awaits) + + for infer_response in inference_responses: + # Make sure that the inference response doesn't have an error. + # If it has an error and you can't proceed with your model + # execution you can raise an exception. + if infer_response.has_error(): + raise pb_utils.TritonModelException( + infer_response.error().message() + ) + + # Get the OUTPUT0 from the "pytorch" model inference response + pytorch_output0_tensor = pb_utils.get_output_tensor_by_name( + inference_responses[0], "OUTPUT0" + ) + + # Get the OUTPUT1 from the "addsub" model inference response + addsub_output1_tensor = pb_utils.get_output_tensor_by_name( + inference_responses[1], "OUTPUT1" + ) + + # Create InferenceResponse. You can set an error here in case + # there was a problem with handling this inference request. 
+ # Below is an example of how you can set errors in inference + # response: + # + # pb_utils.InferenceResponse( + # output_tensors=..., TritonError("An error occurred")) + # + # Because the infer_response of the models contains the final + # outputs with correct output names, we can just pass the list + # of outputs to the InferenceResponse object. + inference_response = pb_utils.InferenceResponse( + output_tensors=[pytorch_output0_tensor, addsub_output1_tensor] + ) + responses.append(inference_response) + + # You should return a list of pb_utils.InferenceResponse. Length + # of this list must match the length of `requests` list. + return responses + + def finalize(self): + """`finalize` is called only once when the model is being unloaded. + Implementing `finalize` function is OPTIONAL. This function allows + the model to perform any necessary clean ups before exit. + """ + print("Cleaning up...") diff --git a/examples/bls/sync_client.py b/examples/bls/sync_client.py new file mode 100644 index 00000000..d9483e43 --- /dev/null +++ b/examples/bls/sync_client.py @@ -0,0 +1,121 @@ +# Copyright 2021-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import sys + +import numpy as np +import tritonclient.http as httpclient +from tritonclient.utils import * + +model_name = "bls_sync" +shape = [4] + +with httpclient.InferenceServerClient("localhost:8000") as client: + input0_data = np.random.rand(*shape).astype(np.float32) + input1_data = np.random.rand(*shape).astype(np.float32) + inputs = [ + httpclient.InferInput( + "INPUT0", input0_data.shape, np_to_triton_dtype(input0_data.dtype) + ), + httpclient.InferInput( + "INPUT1", input1_data.shape, np_to_triton_dtype(input1_data.dtype) + ), + httpclient.InferInput("MODEL_NAME", [1], np_to_triton_dtype(np.object_)), + ] + inputs[0].set_data_from_numpy(input0_data) + inputs[1].set_data_from_numpy(input1_data) + + # Will perform the inference request on the 'add_sub' model. 
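+    # MODEL_NAME is declared as TYPE_STRING in sync_config.pbtxt, so its value
+    # is sent as a single-element numpy object (bytes) array. The BLS model
+    # reads this tensor to decide which model it will call.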
+ inputs[2].set_data_from_numpy(np.array(["add_sub"], dtype=np.object_)) + + outputs = [ + httpclient.InferRequestedOutput("OUTPUT0"), + httpclient.InferRequestedOutput("OUTPUT1"), + ] + + response = client.infer(model_name, inputs, request_id=str(1), outputs=outputs) + + result = response.get_response() + output0_data = response.as_numpy("OUTPUT0") + output1_data = response.as_numpy("OUTPUT1") + print("=========='add_sub' model result==========") + print( + "INPUT0 ({}) + INPUT1 ({}) = OUTPUT0 ({})".format( + input0_data, input1_data, output0_data + ) + ) + print( + "INPUT0 ({}) - INPUT1 ({}) = OUTPUT1 ({})".format( + input0_data, input1_data, output1_data + ) + ) + if not np.allclose(input0_data + input1_data, output0_data): + print("BLS sync example error: incorrect sum") + sys.exit(1) + + if not np.allclose(input0_data - input1_data, output1_data): + print("BLS sync example error: incorrect difference") + sys.exit(1) + + # Will perform the inference request on the pytorch model: + inputs[2].set_data_from_numpy(np.array(["pytorch"], dtype=np.object_)) + response = client.infer(model_name, inputs, request_id=str(1), outputs=outputs) + + result = response.get_response() + output0_data = response.as_numpy("OUTPUT0") + output1_data = response.as_numpy("OUTPUT1") + print("\n") + print("=========='pytorch' model result==========") + print( + "INPUT0 ({}) + INPUT1 ({}) = OUTPUT0 ({})".format( + input0_data, input1_data, output0_data + ) + ) + print( + "INPUT0 ({}) - INPUT1 ({}) = OUTPUT1 ({})".format( + input0_data, input1_data, output1_data + ) + ) + if not np.allclose(input0_data + input1_data, output0_data): + print("BLS sync example error: incorrect sum") + sys.exit(1) + + if not np.allclose(input0_data - input1_data, output1_data): + print("BLS sync example error: incorrect difference") + sys.exit(1) + + # Will perform the same inference request on an undefined model. This leads + # to an exception: + print("\n") + print("=========='undefined' model result==========") + try: + inputs[2].set_data_from_numpy(np.array(["undefined_model"], dtype=np.object_)) + _ = client.infer(model_name, inputs, request_id=str(1), outputs=outputs) + except InferenceServerException as e: + print(e.message()) + + print("PASS: BLS Sync") + sys.exit(0) diff --git a/examples/bls/sync_config.pbtxt b/examples/bls/sync_config.pbtxt new file mode 100644 index 00000000..d2c4ccd2 --- /dev/null +++ b/examples/bls/sync_config.pbtxt @@ -0,0 +1,66 @@ +# Copyright 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +name: "bls_sync" +backend: "python" + +input [ + { + name: "MODEL_NAME" + data_type: TYPE_STRING + dims: [ 1 ] + } +] +input [ + { + name: "INPUT0" + data_type: TYPE_FP32 + dims: [ 4 ] + } +] +input [ + { + name: "INPUT1" + data_type: TYPE_FP32 + dims: [ 4 ] + } +] +output [ + { + name: "OUTPUT0" + data_type: TYPE_FP32 + dims: [ 4 ] + } +] +output [ + { + name: "OUTPUT1" + data_type: TYPE_FP32 + dims: [ 4 ] + } +] + +instance_group [{ kind: KIND_CPU }] diff --git a/examples/bls/sync_model.py b/examples/bls/sync_model.py new file mode 100644 index 00000000..f89bed72 --- /dev/null +++ b/examples/bls/sync_model.py @@ -0,0 +1,140 @@ +# Copyright 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import json + +# triton_python_backend_utils is available in every Triton Python model. You +# need to use this module to create inference requests and responses. It also +# contains some utility functions for extracting information from model_config +# and converting Triton input/output types to numpy types. +import triton_python_backend_utils as pb_utils + + +class TritonPythonModel: + """Your Python model must use the same class name. Every Python model + that is created must have "TritonPythonModel" as the class name. + """ + + def initialize(self, args): + """`initialize` is called only once when the model is being loaded. + Implementing `initialize` function is optional. This function allows + the model to initialize any state associated with this model. 
+ + Parameters + ---------- + args : dict + Both keys and values are strings. The dictionary keys and values are: + * model_config: A JSON string containing the model configuration + * model_instance_kind: A string containing model instance kind + * model_instance_device_id: A string containing model instance device ID + * model_repository: Model repository path + * model_version: Model version + * model_name: Model name + """ + + # You must parse model_config. JSON string is not parsed here + self.model_config = json.loads(args["model_config"]) + + def execute(self, requests): + """`execute` must be implemented in every Python model. `execute` + function receives a list of pb_utils.InferenceRequest as the only + argument. This function is called when an inference request is made + for this model. Depending on the batching configuration (e.g. Dynamic + Batching) used, `requests` may contain multiple requests. Every + Python model, must create one pb_utils.InferenceResponse for every + pb_utils.InferenceRequest in `requests`. If there is an error, you can + set the error argument when creating a pb_utils.InferenceResponse + + Parameters + ---------- + requests : list + A list of pb_utils.InferenceRequest + + Returns + ------- + list + A list of pb_utils.InferenceResponse. The length of this list must + be the same as `requests` + """ + + responses = [] + # Every Python backend must iterate over everyone of the requests + # and create a pb_utils.InferenceResponse for each of them. + for request in requests: + # Get INPUT0 + in_0 = pb_utils.get_input_tensor_by_name(request, "INPUT0") + + # Get INPUT1 + in_1 = pb_utils.get_input_tensor_by_name(request, "INPUT1") + + # Get Model Name + model_name = pb_utils.get_input_tensor_by_name(request, "MODEL_NAME") + + # Model Name string + model_name_string = model_name.as_numpy()[0] + + # Create inference request object + infer_request = pb_utils.InferenceRequest( + model_name=model_name_string, + requested_output_names=["OUTPUT0", "OUTPUT1"], + inputs=[in_0, in_1], + ) + + # Perform synchronous blocking inference request + infer_response = infer_request.exec() + + # Make sure that the inference response doesn't have an error. If + # it has an error and you can't proceed with your model execution + # you can raise an exception. + if infer_response.has_error(): + raise pb_utils.TritonModelException(infer_response.error().message()) + + # Create InferenceResponse. You can set an error here in case + # there was a problem with handling this inference request. + # Below is an example of how you can set errors in inference + # response: + # + # pb_utils.InferenceResponse( + # output_tensors=..., TritonError("An error occurred")) + # + # Because the infer_response of the models contains the final + # outputs with correct output names, we can just pass the list + # of outputs to the InferenceResponse object. + inference_response = pb_utils.InferenceResponse( + output_tensors=infer_response.output_tensors() + ) + responses.append(inference_response) + + # You should return a list of pb_utils.InferenceResponse. Length + # of this list must match the length of `requests` list. + return responses + + def finalize(self): + """`finalize` is called only once when the model is being unloaded. + Implementing `finalize` function is OPTIONAL. This function allows + the model to perform any necessary clean ups before exit. 
+ """ + print("Cleaning up...") diff --git a/examples/bls_decoupled/README.md b/examples/bls_decoupled/README.md new file mode 100644 index 00000000..1f64fee5 --- /dev/null +++ b/examples/bls_decoupled/README.md @@ -0,0 +1,163 @@ + + +# Example of using BLS with decoupled models + +In this section we demonstrate an end-to-end example for +[BLS](../../README.md#business-logic-scripting) in Python backend. The +[model repository](https://github.com/triton-inference-server/server/blob/main/docs/user_guide/model_repository.md) +should contain [square](../decoupled) model. The [square](../decoupled) model +will send 'n' responses where 'n' is the value of input `IN`. For each response, +output `OUT` will equal the value of `IN`. This example is broken into two +sections. The first section demonstrates how to perform synchronous BLS requests +and the second section shows how to execute asynchronous BLS requests. + +## Synchronous BLS Requests with Decoupled Models + +The goal of `bls_decoupled_sync` model is to calculate the sum of the responses +returned from the [square](../decoupled) model and return the summation as the final response. The value of input 'IN' will be passed as an input to the +[square](../decoupled) model which determines how many responses the +[square](../decoupled) model will generate. + +1. Create the model repository: + +```console +mkdir -p models/bls_decoupled_sync/1 +mkdir -p models/square_int32/1 + +# Copy the Python models +cp examples/bls_decoupled/sync_model.py models/bls_decoupled_sync/1/model.py +cp examples/bls_decoupled/sync_config.pbtxt models/bls_decoupled_sync/config.pbtxt +cp examples/decoupled/square_model.py models/square_int32/1/model.py +cp examples/decoupled/square_config.pbtxt models/square_int32/config.pbtxt +``` + +2. Start the tritonserver: + +``` +tritonserver --model-repository `pwd`/models +``` + +3. Send inference requests to server: + +``` +python3 examples/bls_decoupled/sync_client.py +``` + +You should see an output similar to the output below: + +``` +==========model result========== +The square value of [4] is [16] + +==========model result========== +The square value of [2] is [4] + +==========model result========== +The square value of [0] is [0] + +==========model result========== +The square value of [1] is [1] + +PASS: BLS Decoupled Sync +``` + +The [sync_model.py](./sync_model.py) model file is heavily commented with +explanations about each of the function calls. + +### Explanation of the Client Output + +The [client.py](./sync_client.py) sends 4 inference requests to the +`bls_decoupled_sync` model with the input as: [4], [2], [0] and [1] +respectively. In compliance with the behavior of the sync BLS model, +it will expect the output to be the square value of the input. + +## Asynchronous BLS Requests with Decoupled Models + +In this section we explain how to send multiple BLS requests without waiting for +their response. Asynchronous execution of BLS requests will not block your +model execution and can lead to speedups under certain conditions. + +The `bls_decoupled_async` model will perform two async BLS requests on the +[square](../decoupled) model. Then, it will wait until the inference requests +are completed. It will calculate the sum of the output `OUT` from the +[square](../decoupled) model in both two requests to construct the final +inference response object using these tensors. + +1. 
Create the model repository:
+
+```console
+mkdir -p models/bls_decoupled_async/1
+mkdir -p models/square_int32/1
+
+# Copy the Python models
+cp examples/bls_decoupled/async_model.py models/bls_decoupled_async/1/model.py
+cp examples/bls_decoupled/async_config.pbtxt models/bls_decoupled_async/config.pbtxt
+cp examples/decoupled/square_model.py models/square_int32/1/model.py
+cp examples/decoupled/square_config.pbtxt models/square_int32/config.pbtxt
+```
+
+2. Start the tritonserver:
+
+```
+tritonserver --model-repository `pwd`/models
+```
+
+3. Send inference requests to server:
+
+```
+python3 examples/bls_decoupled/async_client.py
+```
+
+You should see an output similar to the output below:
+
+```
+==========model result==========
+Two times the square value of [4] is [32]
+
+==========model result==========
+Two times the square value of [2] is [8]
+
+==========model result==========
+Two times the square value of [0] is [0]
+
+==========model result==========
+Two times the square value of [1] is [2]
+
+PASS: BLS Decoupled Async
+```
+
+The [async_model.py](./async_model.py) model file is heavily commented with
+explanations about each of the function calls.
+
+### Explanation of the Client Output
+
+The [client.py](./async_client.py) sends 4 inference requests to the
+'bls_decoupled_async' model with the input as: [4], [2], [0] and [1]
+respectively. In compliance with the behavior of the async BLS model,
+it will expect the output to be two times the square value of the input.
diff --git a/examples/bls_decoupled/async_client.py b/examples/bls_decoupled/async_client.py
new file mode 100644
index 00000000..f701974d
--- /dev/null
+++ b/examples/bls_decoupled/async_client.py
@@ -0,0 +1,68 @@
+# Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+#  * Redistributions of source code must retain the above copyright
+#    notice, this list of conditions and the following disclaimer.
+#  * Redistributions in binary form must reproduce the above copyright
+#    notice, this list of conditions and the following disclaimer in the
+#    documentation and/or other materials provided with the distribution.
+#  * Neither the name of NVIDIA CORPORATION nor the names of its
+#    contributors may be used to endorse or promote products derived
+#    from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ +import sys + +import numpy as np +import tritonclient.http as httpclient +from tritonclient.utils import * + +model_name = "bls_decoupled_async" +shape = [1] + +with httpclient.InferenceServerClient("localhost:8000") as client: + in_values = [4, 2, 0, 1] + + for in_value in in_values: + input_data = np.array([in_value], dtype=np.int32) + inputs = [ + httpclient.InferInput( + "IN", input_data.shape, np_to_triton_dtype(input_data.dtype) + ) + ] + inputs[0].set_data_from_numpy(input_data) + outputs = [httpclient.InferRequestedOutput("SUM")] + + response = client.infer(model_name, inputs, request_id=str(1), outputs=outputs) + + result = response.get_response() + # output_data contains two times of the square value of the input value. + output_data = response.as_numpy("SUM") + print("==========model result==========") + print( + "Two times the square value of {} is {}\n".format(input_data, output_data) + ) + + if not np.allclose((2 * input_data * input_data), output_data): + print( + "BLS Decoupled Async example error: incorrect output value. Expected {}, got {}.".format( + (2 * input_data * input_data), output_data + ) + ) + sys.exit(1) + + print("PASS: BLS Decoupled Async") + sys.exit(0) diff --git a/examples/bls_decoupled/async_config.pbtxt b/examples/bls_decoupled/async_config.pbtxt new file mode 100644 index 00000000..fb999104 --- /dev/null +++ b/examples/bls_decoupled/async_config.pbtxt @@ -0,0 +1,45 @@ +# Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +name: "bls_decoupled_async" +backend: "python" + +input [ + { + name: "IN" + data_type: TYPE_INT32 + dims: [ 1 ] + } +] +output [ + { + name: "SUM" + data_type: TYPE_INT32 + dims: [ 1 ] + } +] + +instance_group [{ kind: KIND_CPU }] diff --git a/examples/bls_decoupled/async_model.py b/examples/bls_decoupled/async_model.py new file mode 100644 index 00000000..0a69a628 --- /dev/null +++ b/examples/bls_decoupled/async_model.py @@ -0,0 +1,169 @@ +# Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import asyncio +import json + +import numpy as np + +# triton_python_backend_utils is available in every Triton Python model. You +# need to use this module to create inference requests and responses. It also +# contains some utility functions for extracting information from model_config +# and converting Triton input/output types to numpy types. +import triton_python_backend_utils as pb_utils + + +class TritonPythonModel: + """Your Python model must use the same class name. Every Python model + that is created must have "TritonPythonModel" as the class name. + + This model demonstrates how to use BLS with decoupled models. + + This model has a single input and a single output. The model does not + support batching. + - Input 'IN' shape must be equal to [1], datatype must be INT32. + - For each response, output 'SUM' shape must be equal to [1], datatype + must be INT32. + + For every request, the model will send a single response that contains an + output named 'SUM'. We will send two BLS requests to the square model and + the 'SUM' will contain the summation of the 'OUT' response output returned + by the square model in the two BLS requests. The input 'IN' determines how + many responses the square model will generate. + """ + + def initialize(self, args): + """`initialize` is called only once when the model is being loaded. + Implementing `initialize` function is optional. This function allows + the model to initialize any state associated with this model. + + Parameters + ---------- + args : dict + Both keys and values are strings. The dictionary keys and values are: + * model_config: A JSON string containing the model configuration + * model_instance_kind: A string containing model instance kind + * model_instance_device_id: A string containing model instance device ID + * model_repository: Model repository path + * model_version: Model version + * model_name: Model name + """ + + # You must parse model_config. 
JSON string is not parsed here + self.model_config = json.loads(args["model_config"]) + + # You must add the Python 'async' keyword to the beginning of `execute` + # function if you want to use `async_exec` function. + async def execute(self, requests): + """`execute` must be implemented in every Python model. `execute` + function receives a list of pb_utils.InferenceRequest as the only + argument. This function is called when an inference request is made + for this model. Depending on the batching configuration (e.g. Dynamic + Batching) used, `requests` may contain multiple requests. Every + Python model, must create one pb_utils.InferenceResponse for every + pb_utils.InferenceRequest in `requests`. If there is an error, you can + set the error argument when creating a pb_utils.InferenceResponse + + Parameters + ---------- + requests : list + A list of pb_utils.InferenceRequest + + Returns + ------- + list + A list of pb_utils.InferenceResponse. The length of this list must + be the same as `requests` + """ + + # This model does not support batching, so 'request_count' should + # always be 1. + if len(requests) != 1: + raise pb_utils.TritonModelException( + "unsupported batch size " + len(requests) + ) + + response_num = pb_utils.get_input_tensor_by_name(requests[0], "IN") + + # List of awaitables containing inflight inference responses. + inference_response_awaits = [] + + # For detailed explanation about the inputs of the repeat model, refer + # to the example below: + # https://github.com/triton-inference-server/python_backend/blob/r22.12/examples/decoupled/square_model.py + # Construct two BLS requests + for _ in range(2): + infer_request = pb_utils.InferenceRequest( + model_name="square_int32", + inputs=[response_num], + requested_output_names=["OUT"], + ) + # Store the awaitable inside the array. We don't need + # the inference response immediately so we do not `await` + # here. + inference_response_awaits.append(infer_request.async_exec(decoupled=True)) + + # Wait for all the inference requests to finish. The execution + # of the Python script will be blocked until all the awaitables + # are resolved. + async_responses = await asyncio.gather(*inference_response_awaits) + + # The variable that will store the sum of the responses. + response_sum = np.array([0]) + + # Iterate over the list of generators of responses returned by the BLS + # request. This interface can support zero, one, and many inference + # responses per request. + for infer_responses in async_responses: + for infer_response in infer_responses: + # If inference response has an error, raise an exception + if infer_response.has_error(): + raise pb_utils.TritonModelException( + infer_response.error().message() + ) + + # Check for the last empty response. + if len(infer_response.output_tensors()) > 0: + response_sum += pb_utils.get_output_tensor_by_name( + infer_response, "OUT" + ).as_numpy() + + response = [ + pb_utils.InferenceResponse( + output_tensors=[pb_utils.Tensor("SUM", response_sum)] + ) + ] + + # Since the model is using the default mode in this example, we + # will be returning a single response. + return response + + def finalize(self): + """`finalize` is called only once when the model is being unloaded. + Implementing `finalize` function is OPTIONAL. This function allows + the model to perform any necessary clean ups before exit. 
+        """
+        print("Cleaning up...")
diff --git a/examples/bls_decoupled/sync_client.py b/examples/bls_decoupled/sync_client.py
new file mode 100644
index 00000000..63156481
--- /dev/null
+++ b/examples/bls_decoupled/sync_client.py
@@ -0,0 +1,63 @@
+# Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+#  * Redistributions of source code must retain the above copyright
+#    notice, this list of conditions and the following disclaimer.
+#  * Redistributions in binary form must reproduce the above copyright
+#    notice, this list of conditions and the following disclaimer in the
+#    documentation and/or other materials provided with the distribution.
+#  * Neither the name of NVIDIA CORPORATION nor the names of its
+#    contributors may be used to endorse or promote products derived
+#    from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+import sys
+
+import numpy as np
+import tritonclient.http as httpclient
+from tritonclient.utils import *
+
+model_name = "bls_decoupled_sync"
+shape = [1]
+
+with httpclient.InferenceServerClient("localhost:8000") as client:
+    in_values = [4, 2, 0, 1]
+
+    for in_value in in_values:
+        input_data = np.array([in_value], dtype=np.int32)
+        inputs = [
+            httpclient.InferInput(
+                "IN", input_data.shape, np_to_triton_dtype(input_data.dtype)
+            )
+        ]
+        inputs[0].set_data_from_numpy(input_data)
+        outputs = [httpclient.InferRequestedOutput("SUM")]
+
+        response = client.infer(model_name, inputs, request_id=str(1), outputs=outputs)
+
+        result = response.get_response()
+        output_data = response.as_numpy("SUM")
+        print("==========model result==========")
+        print("The square value of {} is {}\n".format(input_data, output_data))
+
+        if not np.allclose(input_data * input_data, output_data):
+            print(
+                "BLS Decoupled Sync example error: incorrect output value. Expected {}, got {}.".format(
+                    input_data * input_data, output_data
+                )
+            )
+            sys.exit(1)
+
+    print("PASS: BLS Decoupled Sync")
+    sys.exit(0)
diff --git a/examples/bls_decoupled/sync_config.pbtxt b/examples/bls_decoupled/sync_config.pbtxt
new file mode 100644
index 00000000..f9fe85ea
--- /dev/null
+++ b/examples/bls_decoupled/sync_config.pbtxt
@@ -0,0 +1,45 @@
+# Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+#  * Redistributions of source code must retain the above copyright
+#    notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +name: "bls_decoupled_sync" +backend: "python" + +input [ + { + name: "IN" + data_type: TYPE_INT32 + dims: [ 1 ] + } +] +output [ + { + name: "SUM" + data_type: TYPE_INT32 + dims: [ 1 ] + } +] + +instance_group [{ kind: KIND_CPU }] diff --git a/examples/bls_decoupled/sync_model.py b/examples/bls_decoupled/sync_model.py new file mode 100644 index 00000000..afc755e5 --- /dev/null +++ b/examples/bls_decoupled/sync_model.py @@ -0,0 +1,151 @@ +# Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import json + +import numpy as np + +# triton_python_backend_utils is available in every Triton Python model. You +# need to use this module to create inference requests and responses. It also +# contains some utility functions for extracting information from model_config +# and converting Triton input/output types to numpy types. 
+import triton_python_backend_utils as pb_utils + + +class TritonPythonModel: + """Your Python model must use the same class name. Every Python model + that is created must have "TritonPythonModel" as the class name. + + This model demonstrates how to use BLS with decoupled models. + + This model has a single input and a single output. The model does not + support batching. + - Input 'IN' shape must be equal to [1], datatype must be INT32. + - For each response, output 'SUM' shape must be equal to [1], datatype + must be INT32. + + For every request, the model will send a single response that contains an + output named 'SUM'. The 'SUM' will contain the summation of the 'OUT' + response output returned by the square model. The input 'IN' determines how + many responses the square model will generate. + """ + + def initialize(self, args): + """`initialize` is called only once when the model is being loaded. + Implementing `initialize` function is optional. This function allows + the model to initialize any state associated with this model. + + Parameters + ---------- + args : dict + Both keys and values are strings. The dictionary keys and values are: + * model_config: A JSON string containing the model configuration + * model_instance_kind: A string containing model instance kind + * model_instance_device_id: A string containing model instance device ID + * model_repository: Model repository path + * model_version: Model version + * model_name: Model name + """ + + # You must parse model_config. JSON string is not parsed here + self.model_config = json.loads(args["model_config"]) + + def execute(self, requests): + """`execute` must be implemented in every Python model. `execute` + function receives a list of pb_utils.InferenceRequest as the only + argument. This function is called when an inference request is made + for this model. Depending on the batching configuration (e.g. Dynamic + Batching) used, `requests` may contain multiple requests. Every + Python model, must create one pb_utils.InferenceResponse for every + pb_utils.InferenceRequest in `requests`. If there is an error, you can + set the error argument when creating a pb_utils.InferenceResponse + + Parameters + ---------- + requests : list + A list of pb_utils.InferenceRequest + + Returns + ------- + list + A list of pb_utils.InferenceResponse. The length of this list must + be the same as `requests` + """ + + # This model does not support batching, so 'request_count' should + # always be 1. + if len(requests) != 1: + raise pb_utils.TritonModelException( + "unsupported batch size " + len(requests) + ) + + response_num = pb_utils.get_input_tensor_by_name(requests[0], "IN") + + # For detailed explanation about the inputs of the repeat model, refer + # to the example below: + # https://github.com/triton-inference-server/python_backend/blob/r22.12/examples/decoupled/square_model.py + # Construct the BLS request + infer_request = pb_utils.InferenceRequest( + model_name="square_int32", + inputs=[response_num], + requested_output_names=["OUT"], + ) + + # The variable that will store the sum of the responses. + response_sum = np.array([0]) + + # Iterate over the generator of responses returned by the BLS request. + # This interface can support zero, one, and many inference responses + # per request. 
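+        # Note: `exec(decoupled=True)` returns an iterable of responses rather
+        # than a single response. The stream from the decoupled 'square_int32'
+        # model ends with an empty response, which is why the loop below only
+        # accumulates responses that carry output tensors.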
+ infer_responses = infer_request.exec(decoupled=True) + + for infer_response in infer_responses: + # If inference response has an error, raise an exception + if infer_response.has_error(): + raise pb_utils.TritonModelException(infer_response.error().message()) + + # Check for the last empty response. + if len(infer_response.output_tensors()) > 0: + response_sum += pb_utils.get_output_tensor_by_name( + infer_response, "OUT" + ).as_numpy() + + response = [ + pb_utils.InferenceResponse( + output_tensors=[pb_utils.Tensor("SUM", response_sum)] + ) + ] + + # Since the model is using the default mode in this example, we + # will be returning a single response. + return response + + def finalize(self): + """`finalize` is called only once when the model is being unloaded. + Implementing `finalize` function is OPTIONAL. This function allows + the model to perform any necessary clean ups before exit. + """ + print("Cleaning up...") diff --git a/examples/custom_metrics/README.md b/examples/custom_metrics/README.md new file mode 100644 index 00000000..88831e22 --- /dev/null +++ b/examples/custom_metrics/README.md @@ -0,0 +1,86 @@ + + +# Custom Metrics Example + +In this section we demonstrate an end-to-end example for +[Custom Metrics API](../../README.md#custom-metrics) in Python backend. The +[model repository](https://github.com/triton-inference-server/server/blob/main/docs/user_guide/model_repository.md) +should contain [custom_metrics](./model.py) model. The +[custom_metrics](./model.py) model uses +[Custom Metrics API](../../README.md#custom-metrics) to register and collect +custom metrics. + +## Deploying the Custom Metrics Models + +1. Create the model repository: + +```console +mkdir -p models/custom_metrics/1/ + +# Copy the Python models +cp examples/custom_metrics/model.py models/custom_metrics/1/model.py +cp examples/custom_metrics/config.pbtxt models/custom_metrics/config.pbtxt +``` + +2. Start the tritonserver: + +``` +tritonserver --model-repository `pwd`/models +``` + +3. Send inference requests to server: + +``` +python3 examples/custom_metrics/client.py +``` + +You should see an output similar to the output below in the client terminal: + +``` +custom_metrics example: found pattern '# HELP requests_process_latency_ns Cumulative time spent processing requests' in metrics +custom_metrics example: found pattern '# TYPE requests_process_latency_ns counter' in metrics +custom_metrics example: found pattern 'requests_process_latency_ns{model="custom_metrics",version="1"}' in metrics +PASS: custom_metrics +``` + +In the terminal that runs Triton Server, you should see an output similar to +the output below: +``` +Cumulative requests processing latency: 223406.0 +``` + +The [model.py](./model.py) model file is heavily commented with +explanations about each of the function calls. + +### Explanation of the Client Output + +The [client.py](./client.py) sends a HTTP request with url +`http://localhost:8002/metrics` to fetch the metrics from Triton server. The +client then verifies if the custom metrics added in the model file are +correctly reported. diff --git a/examples/custom_metrics/client.py b/examples/custom_metrics/client.py new file mode 100644 index 00000000..64ae31e4 --- /dev/null +++ b/examples/custom_metrics/client.py @@ -0,0 +1,98 @@ +# Copyright 2023, NVIDIA CORPORATION& AFFILIATES.All rights reserved. 
+# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and / or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED.IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES(INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import sys + +import numpy as np +import requests +import tritonclient.http as httpclient +from tritonclient.utils import * + +model_name = "custom_metrics" +shape = [4] + + +def get_metrics(): + metrics_url = "http://localhost:8002/metrics" + r = requests.get(metrics_url) + r.raise_for_status() + return r.text + + +with httpclient.InferenceServerClient("localhost:8000") as client: + input0_data = np.random.rand(*shape).astype(np.float32) + input1_data = np.random.rand(*shape).astype(np.float32) + inputs = [ + httpclient.InferInput( + "INPUT0", input0_data.shape, np_to_triton_dtype(input0_data.dtype) + ), + httpclient.InferInput( + "INPUT1", input1_data.shape, np_to_triton_dtype(input1_data.dtype) + ), + ] + + inputs[0].set_data_from_numpy(input0_data) + inputs[1].set_data_from_numpy(input1_data) + + outputs = [ + httpclient.InferRequestedOutput("OUTPUT0"), + httpclient.InferRequestedOutput("OUTPUT1"), + ] + + response = client.infer(model_name, inputs, request_id=str(1), outputs=outputs) + + output0_data = response.as_numpy("OUTPUT0") + output1_data = response.as_numpy("OUTPUT1") + + if not np.allclose(input0_data + input1_data, output0_data): + print("custom_metrics example error: incorrect sum") + sys.exit(1) + + if not np.allclose(input0_data - input1_data, output1_data): + print("custom_metrics example error: incorrect difference") + sys.exit(1) + + metrics = get_metrics() + patterns = [ + "# HELP requests_process_latency_ns Cumulative time spent processing requests", + "# TYPE requests_process_latency_ns counter", + 'requests_process_latency_ns{model="custom_metrics",version="1"}', + ] + for pattern in patterns: + if pattern not in metrics: + print( + "custom_metrics example error: missing pattern '{}' in metrics".format( + pattern + ) + ) + sys.exit(1) + else: + print( + "custom_metrics example: found pattern '{}' in metrics".format(pattern) + ) + + print("PASS: custom_metrics") + sys.exit(0) diff --git a/examples/custom_metrics/config.pbtxt b/examples/custom_metrics/config.pbtxt new file mode 100644 index 00000000..2a8192c3 
--- /dev/null +++ b/examples/custom_metrics/config.pbtxt @@ -0,0 +1,65 @@ +# Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +name: "custom_metrics" +backend: "python" + +input [ + { + name: "INPUT0" + data_type: TYPE_FP32 + dims: [ 4 ] + } +] +input [ + { + name: "INPUT1" + data_type: TYPE_FP32 + dims: [ 4 ] + } +] +output [ + { + name: "OUTPUT0" + data_type: TYPE_FP32 + dims: [ 4 ] + } +] +output [ + { + name: "OUTPUT1" + data_type: TYPE_FP32 + dims: [ 4 ] + } +] + +instance_group [ + { + count: 3 + kind: KIND_CPU + } +] + diff --git a/examples/custom_metrics/model.py b/examples/custom_metrics/model.py new file mode 100644 index 00000000..ad3b4e6f --- /dev/null +++ b/examples/custom_metrics/model.py @@ -0,0 +1,174 @@ +# Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import json +import time + +# triton_python_backend_utils is available in every Triton Python model. You +# need to use this module to create inference requests and responses. It also +# contains some utility functions for extracting information from model_config +# and converting Triton input/output types to numpy types. +import triton_python_backend_utils as pb_utils + + +class TritonPythonModel: + """Your Python model must use the same class name. Every Python model + that is created must have "TritonPythonModel" as the class name. + """ + + def initialize(self, args): + """`initialize` is called only once when the model is being loaded. + Implementing `initialize` function is optional. This function allows + the model to initialize any state associated with this model. + + Parameters + ---------- + args : dict + Both keys and values are strings. The dictionary keys and values are: + * model_config: A JSON string containing the model configuration + * model_instance_kind: A string containing model instance kind + * model_instance_device_id: A string containing model instance device ID + * model_repository: Model repository path + * model_version: Model version + * model_name: Model name + """ + + # Parse model_config and extract OUTPUT0 and OUTPUT1 configuration + self.model_config = model_config = json.loads(args["model_config"]) + output0_config = pb_utils.get_output_config_by_name(model_config, "OUTPUT0") + output1_config = pb_utils.get_output_config_by_name(model_config, "OUTPUT1") + + # Convert Triton types to numpy types + self.out0_dtype = pb_utils.triton_string_to_numpy(output0_config["data_type"]) + self.out1_dtype = pb_utils.triton_string_to_numpy(output1_config["data_type"]) + + # Create a MetricFamily object to report the latency of the model + # execution. The 'kind' parameter must be either 'COUNTER' or + # 'GAUGE'. + # If duplicate name is used, both MetricFamily objects + # will reference to the same underlying MetricFamily. If there are two + # MetricFamily objects with the same name and same kind but different + # description, the original description will be used. Note that + # Duplicate name with different kind is not allowed. + self.metric_family = pb_utils.MetricFamily( + name="requests_process_latency_ns", + description="Cumulative time spent processing requests", + kind=pb_utils.MetricFamily.COUNTER, # or pb_utils.MetricFamily.GAUGE + ) + + # Create a Metric object under the MetricFamily object. The 'labels' + # is a dictionary of key-value pairs. You can create multiple Metric + # objects under the same MetricFamily object with unique labels. Empty + # labels is allowed. The 'labels' parameter is optional. If you don't + # specify the 'labels' parameter, empty labels will be used. + self.metric = self.metric_family.Metric( + labels={"model": "custom_metrics", "version": "1"} + ) + + def execute(self, requests): + """`execute` MUST be implemented in every Python model. 
`execute` + function receives a list of pb_utils.InferenceRequest as the only + argument. This function is called when an inference request is made + for this model. Depending on the batching configuration (e.g. Dynamic + Batching) used, `requests` may contain multiple requests. Every + Python model, must create one pb_utils.InferenceResponse for every + pb_utils.InferenceRequest in `requests`. If there is an error, you can + set the error argument when creating a pb_utils.InferenceResponse + + Parameters + ---------- + requests : list + A list of pb_utils.InferenceRequest + + Returns + ------- + list + A list of pb_utils.InferenceResponse. The length of this list must + be the same as `requests` + """ + + responses = [] + + # Record the start time of processing the requests + start_ns = time.time_ns() + # Every Python backend must iterate over everyone of the requests + # and create a pb_utils.InferenceResponse for each of them. + for request in requests: + # Get INPUT0 + in_0 = pb_utils.get_input_tensor_by_name(request, "INPUT0") + # Get INPUT1 + in_1 = pb_utils.get_input_tensor_by_name(request, "INPUT1") + + out_0, out_1 = ( + in_0.as_numpy() + in_1.as_numpy(), + in_0.as_numpy() - in_1.as_numpy(), + ) + + # Create output tensors. You need pb_utils.Tensor + # objects to create pb_utils.InferenceResponse. + out_tensor_0 = pb_utils.Tensor("OUTPUT0", out_0.astype(self.out0_dtype)) + out_tensor_1 = pb_utils.Tensor("OUTPUT1", out_1.astype(self.out1_dtype)) + + # Create InferenceResponse. You can set an error here in case + # there was a problem with handling this inference request. + # Below is an example of how you can set errors in inference + # response: + # + # pb_utils.InferenceResponse( + # output_tensors=..., TritonError("An error occurred")) + inference_response = pb_utils.InferenceResponse( + output_tensors=[out_tensor_0, out_tensor_1] + ) + responses.append(inference_response) + + # Record the end time of processing the requests + end_ns = time.time_ns() + + # Update metric to track cumulative requests processing latency. + # There are three operations you can do with the Metric object: + # - Metric.increment(value): Increment the value of the metric by + # the given value. The type of the value is double. The 'COUNTER' + # kind does not support negative value. + # - Metric.set(value): Set the value of the metric to the given + # value. This operation is only supported in 'GAUGE' kind. The + # type of the value is double. + # - Metric.value(): Get the current value of the metric. + self.metric.increment(end_ns - start_ns) + logger = pb_utils.Logger + logger.log_info( + "Cumulative requests processing latency: {}".format(self.metric.value()) + ) + + # You should return a list of pb_utils.InferenceResponse. Length + # of this list must match the length of `requests` list. + return responses + + def finalize(self): + """`finalize` is called only once when the model is being unloaded. + Implementing `finalize` function is OPTIONAL. This function allows + the model to perform any necessary clean ups before exit. + """ + print("Cleaning up...") diff --git a/examples/decoupled/README.md b/examples/decoupled/README.md new file mode 100644 index 00000000..4301961e --- /dev/null +++ b/examples/decoupled/README.md @@ -0,0 +1,345 @@ + + +# Decoupled Model Examples + +In this section we demonstrate an end-to-end examples for developing and +serving [decoupled models](../../README.md#decoupled-mode) in Python backend. 
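+
+At a high level, a decoupled Python model does not return its responses from
+`execute`. Instead, it asks each request for a response sender, pushes zero or
+more responses through that sender, and then closes the stream with a final
+flag. The snippet below is only a minimal sketch of that pattern (the tensor
+names and the trivial pass-through logic are illustrative, not taken from the
+example models); the fully worked, runnable versions follow below.
+
+```
+import triton_python_backend_utils as pb_utils
+
+
+class TritonPythonModel:
+    def execute(self, requests):
+        for request in requests:
+            # Each request owns a response sender that outlives `execute`.
+            sender = request.get_response_sender()
+
+            # Illustrative only: echo a hypothetical "IN" tensor back once.
+            in_tensor = pb_utils.get_input_tensor_by_name(request, "IN")
+            out_tensor = pb_utils.Tensor("OUT", in_tensor.as_numpy())
+            sender.send(pb_utils.InferenceResponse(output_tensors=[out_tensor]))
+
+            # Tell Triton that no more responses will be sent for this request.
+            sender.send(flags=pb_utils.TRITONSERVER_RESPONSE_COMPLETE_FINAL)
+
+        # Decoupled models return nothing from `execute`; all responses go
+        # through the response senders above.
+        return None
+```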
+ +[repeat_model.py](repeat_model.py) and [square_model.py](square_model.py) demonstrate +how to write a decoupled model where each request can generate 0 to many responses. +These files are heavily commented to describe each function call. +These example models are designed to show the flexibility available to decoupled models +and in no way should be used in production. These examples circumvents +the restriction placed by the +[instance count](https://github.com/triton-inference-server/server/blob/main/docs/user_guide/model_configuration.md#instance-groups) +and allows multiple requests to be in process even for single instance. In +real deployment, the model should not allow the caller thread to return from +`execute` until that instance is ready to handle another set of requests. + +## Deploying the Decoupled Models + +1. Create the model repository: + +```console +mkdir -p models/repeat_int32/1 +mkdir -p models/square_int32/1 + +# Copy the Python models +cp examples/decoupled/repeat_model.py models/repeat_int32/1/model.py +cp examples/decoupled/repeat_config.pbtxt models/repeat_int32/config.pbtxt +cp examples/decoupled/square_model.py models/square_int32/1/model.py +cp examples/decoupled/square_config.pbtxt models/square_int32/config.pbtxt +``` + +2. Start the tritonserver: + +``` +tritonserver --model-repository `pwd`/models +``` + +## Running inference on Repeat model: + +Send inference requests to repeat model using [repeat_client.py](repeat_client.py). + +``` +python3 examples/decoupled/repeat_client.py +``` + +You should see an output similar to the output below: + +``` +stream started... +async_stream_infer +model_name: "repeat_int32" +id: "0" +inputs { + name: "IN" + datatype: "INT32" + shape: 4 +} +inputs { + name: "DELAY" + datatype: "UINT32" + shape: 4 +} +inputs { + name: "WAIT" + datatype: "UINT32" + shape: 1 +} +outputs { + name: "OUT" +} +outputs { + name: "IDX" +} +raw_input_contents: "\004\000\000\000\002\000\000\000\000\000\000\000\001\000\000\000" +raw_input_contents: "\001\000\000\000\002\000\000\000\003\000\000\000\004\000\000\000" +raw_input_contents: "\005\000\000\000" + +enqueued request 0 to stream... +infer_response { + model_name: "repeat_int32" + model_version: "1" + id: "0" + outputs { + name: "IDX" + datatype: "UINT32" + shape: 1 + } + outputs { + name: "OUT" + datatype: "INT32" + shape: 1 + } + raw_output_contents: "\000\000\000\000" + raw_output_contents: "\004\000\000\000" +} + +infer_response { + model_name: "repeat_int32" + model_version: "1" + id: "0" + outputs { + name: "IDX" + datatype: "UINT32" + shape: 1 + } + outputs { + name: "OUT" + datatype: "INT32" + shape: 1 + } + raw_output_contents: "\001\000\000\000" + raw_output_contents: "\002\000\000\000" +} + +infer_response { + model_name: "repeat_int32" + model_version: "1" + id: "0" + outputs { + name: "IDX" + datatype: "UINT32" + shape: 1 + } + outputs { + name: "OUT" + datatype: "INT32" + shape: 1 + } + raw_output_contents: "\002\000\000\000" + raw_output_contents: "\000\000\000\000" +} + +infer_response { + model_name: "repeat_int32" + model_version: "1" + id: "0" + outputs { + name: "IDX" + datatype: "UINT32" + shape: 1 + } + outputs { + name: "OUT" + datatype: "INT32" + shape: 1 + } + raw_output_contents: "\003\000\000\000" + raw_output_contents: "\001\000\000\000" +} + +PASS: repeat_int32 +stream stopped... + +``` + +Look how a single request generated 4 responses. + +## Running inference on Square model: + +Send inference requests to square model using [square_client.py](square_client.py). 
+ +``` +python3 examples/decoupled/square_client.py +``` + +You should see an output similar to the output below: + +``` +stream started... +async_stream_infer +model_name: "square_int32" +id: "0" +inputs { + name: "IN" + datatype: "INT32" + shape: 1 +} +outputs { + name: "OUT" +} +raw_input_contents: "\004\000\000\000" + +enqueued request 0 to stream... +async_stream_infer +model_name: "square_int32" +id: "1" +inputs { + name: "IN" + datatype: "INT32" + shape: 1 +} +outputs { + name: "OUT" +} +raw_input_contents: "\002\000\000\000" + +enqueued request 1 to stream... +async_stream_infer +model_name: "square_int32" +id: "2" +inputs { + name: "IN" + datatype: "INT32" + shape: 1 +} +outputs { + name: "OUT" +} +raw_input_contents: "\000\000\000\000" + +enqueued request 2 to stream... +async_stream_infer +model_name: "square_int32" +id: "3" +inputs { + name: "IN" + datatype: "INT32" + shape: 1 +} +outputs { + name: "OUT" +} +raw_input_contents: "\001\000\000\000" + +enqueued request 3 to stream... +infer_response { + model_name: "square_int32" + model_version: "1" + id: "0" + outputs { + name: "OUT" + datatype: "INT32" + shape: 1 + } + raw_output_contents: "\004\000\000\000" +} + +infer_response { + model_name: "square_int32" + model_version: "1" + id: "1" + outputs { + name: "OUT" + datatype: "INT32" + shape: 1 + } + raw_output_contents: "\002\000\000\000" +} + +infer_response { + model_name: "square_int32" + model_version: "1" + id: "0" + outputs { + name: "OUT" + datatype: "INT32" + shape: 1 + } + raw_output_contents: "\004\000\000\000" +} + +infer_response { + model_name: "square_int32" + model_version: "1" + id: "3" + outputs { + name: "OUT" + datatype: "INT32" + shape: 1 + } + raw_output_contents: "\001\000\000\000" +} + +infer_response { + model_name: "square_int32" + model_version: "1" + id: "1" + outputs { + name: "OUT" + datatype: "INT32" + shape: 1 + } + raw_output_contents: "\002\000\000\000" +} + +infer_response { + model_name: "square_int32" + model_version: "1" + id: "0" + outputs { + name: "OUT" + datatype: "INT32" + shape: 1 + } + raw_output_contents: "\004\000\000\000" +} + +infer_response { + model_name: "square_int32" + model_version: "1" + id: "0" + outputs { + name: "OUT" + datatype: "INT32" + shape: 1 + } + raw_output_contents: "\004\000\000\000" +} + +PASS: square_int32 +stream stopped... + +``` + +Look how responses were delivered out-of-order of requests. +The generated responses can be tracked to their request using +the `id` field. diff --git a/examples/decoupled/repeat_client.py b/examples/decoupled/repeat_client.py new file mode 100644 index 00000000..7d6a1719 --- /dev/null +++ b/examples/decoupled/repeat_client.py @@ -0,0 +1,125 @@ +# Copyright 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. 
+# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import queue +import sys +from functools import partial + +import numpy as np +import tritonclient.grpc as grpcclient +from tritonclient.utils import * + + +class UserData: + def __init__(self): + self._completed_requests = queue.Queue() + + +def callback(user_data, result, error): + if error: + user_data._completed_requests.put(error) + else: + user_data._completed_requests.put(result) + + +# This client sends a single request to the model with the +# following tensor data. In compliance with the behavior +# of repeat_int32 model, it will expect the 4 responses +# with output: [4], [2], [0] and [1] respectively. +model_name = "repeat_int32" +in_value = [4, 2, 0, 1] +delay_value = [1, 2, 3, 4] +wait_value = 5 + +inputs = [] +inputs.append(grpcclient.InferInput("IN", [len(in_value)], "INT32")) +inputs.append(grpcclient.InferInput("DELAY", [len(delay_value)], "UINT32")) +inputs.append(grpcclient.InferInput("WAIT", [1], "UINT32")) + +outputs = [] +outputs.append(grpcclient.InferRequestedOutput("OUT")) +outputs.append(grpcclient.InferRequestedOutput("IDX")) + +user_data = UserData() + +with grpcclient.InferenceServerClient( + url="localhost:8001", verbose=True +) as triton_client: + # Establish stream + triton_client.start_stream(callback=partial(callback, user_data)) + + in_data = np.array(in_value, dtype=np.int32) + inputs[0].set_data_from_numpy(in_data) + delay_data = np.array(delay_value, dtype=np.uint32) + inputs[1].set_data_from_numpy(delay_data) + wait_data = np.array([wait_value], dtype=np.uint32) + inputs[2].set_data_from_numpy(wait_data) + + request_id = "0" + triton_client.async_stream_infer( + model_name=model_name, + inputs=inputs, + request_id=request_id, + outputs=outputs, + ) + + # Retrieve results... + recv_count = 0 + expected_count = len(in_value) + result_dict = {} + while recv_count < expected_count: + data_item = user_data._completed_requests.get() + if type(data_item) == InferenceServerException: + raise data_item + else: + this_id = data_item.get_response().id + if this_id not in result_dict.keys(): + result_dict[this_id] = [] + result_dict[this_id].append((recv_count, data_item)) + + recv_count += 1 + + # Validate results... 
+ if len(result_dict[request_id]) != len(in_value): + print( + "expected {} many responses for request id {}, got {}".format( + len(in_value), request_id, len(result_dict[request_id]) + ) + ) + sys.exit(1) + + result_list = result_dict[request_id] + for i in range(len(result_list)): + expected_data = np.array([in_value[i]], dtype=np.int32) + this_data = result_list[i][1].as_numpy("OUT") + if not np.array_equal(expected_data, this_data): + print( + "incorrect data: expected {}, got {}".format(expected_data, this_data) + ) + sys.exit(1) + + print("PASS: repeat_int32") + sys.exit(0) diff --git a/examples/decoupled/repeat_config.pbtxt b/examples/decoupled/repeat_config.pbtxt new file mode 100644 index 00000000..d72050e2 --- /dev/null +++ b/examples/decoupled/repeat_config.pbtxt @@ -0,0 +1,62 @@ +# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +name: "repeat_int32" +backend: "python" +max_batch_size: 0 +model_transaction_policy { + decoupled: True +} +input [ + { + name: "IN" + data_type: TYPE_INT32 + dims: [ -1 ] + }, + { + name: "DELAY" + data_type: TYPE_UINT32 + dims: [ -1 ] + }, + { + name: "WAIT" + data_type: TYPE_UINT32 + dims: [ 1 ] + } +] +output [ + { + name: "OUT" + data_type: TYPE_INT32 + dims: [ 1 ] + }, + { + name: "IDX" + data_type: TYPE_UINT32 + dims: [ 1 ] + } +] +instance_group [{ kind: KIND_CPU }] diff --git a/examples/decoupled/repeat_model.py b/examples/decoupled/repeat_model.py new file mode 100644 index 00000000..b96a6804 --- /dev/null +++ b/examples/decoupled/repeat_model.py @@ -0,0 +1,263 @@ +# Copyright (c) 2022-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. 
+# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import json +import threading +import time + +import numpy + +# triton_python_backend_utils is available in every Triton Python model. You +# need to use this module to create inference requests and responses. It also +# contains some utility functions for extracting information from model_config +# and converting Triton input/output types to numpy types. +import triton_python_backend_utils as pb_utils + + +class TritonPythonModel: + """Your Python model must use the same class name. Every Python model + that is created must have "TritonPythonModel" as the class name. + + This model demonstrates how to write a decoupled model where each + request can generate 0 to many responses. + + This model has three inputs and two outputs. The model does not support + batching. + + - Input 'IN' can have any vector shape (e.g. [4] or [12]), datatype must + be INT32. + - Input 'DELAY' must have the same shape as IN, datatype must be UINT32. + - Input 'WAIT' must have shape [1] and datatype UINT32. + - For each response, output 'OUT' must have shape [1] and datatype INT32. + - For each response, output 'IDX' must have shape [1] and datatype UINT32. + + For a request, the model will send 'n' responses where 'n' is the number of + elements in IN. For the i'th response, OUT will equal the i'th element of + IN and IDX will equal the zero-based count of this response for the request. + For example, the first response for a request will have IDX = 0 and OUT = + IN[0], the second will have IDX = 1 and OUT = IN[1], etc. The model will + wait the i'th DELAY, in milliseconds, before sending the i'th response. If + IN shape is [0] then no responses will be sent. + + After WAIT milliseconds the model will return from the execute function so + that Triton can call execute again with another request. WAIT can be less + than the sum of DELAY so that execute returns before all responses are sent. + Thus, even if there is only one instance of the model, multiple requests can + be processed at the same time, and the responses for multiple requests can + be intermixed, depending on the values of DELAY and WAIT. + """ + + def initialize(self, args): + """`initialize` is called only once when the model is being loaded. + Implementing `initialize` function is optional. This function allows + the model to initialize any state associated with this model. 
+ + Parameters + ---------- + args : dict + Both keys and values are strings. The dictionary keys and values are: + * model_config: A JSON string containing the model configuration + * model_instance_kind: A string containing model instance kind + * model_instance_device_id: A string containing model instance device ID + * model_repository: Model repository path + * model_version: Model version + * model_name: Model name + """ + + # You must parse model_config. JSON string is not parsed here + self.model_config = model_config = json.loads(args["model_config"]) + + using_decoupled = pb_utils.using_decoupled_model_transaction_policy( + model_config + ) + if not using_decoupled: + raise pb_utils.TritonModelException( + """the model `{}` can generate any number of responses per request, + enable decoupled transaction policy in model configuration to + serve this model""".format( + args["model_name"] + ) + ) + + # Get OUT configuration + out_config = pb_utils.get_output_config_by_name(model_config, "OUT") + + # Get IDX configuration + idx_config = pb_utils.get_output_config_by_name(model_config, "IDX") + + # Convert Triton types to numpy types + self.out_dtype = pb_utils.triton_string_to_numpy(out_config["data_type"]) + self.idx_dtype = pb_utils.triton_string_to_numpy(idx_config["data_type"]) + + # Optional parameter to specify the number of elements in the OUT tensor in each response. + # Defaults to 1 if not provided. Example: If input 'IN' is [4] and 'output_num_elements' is set to 3, + # then 'OUT' will be [4, 4, 4]. If 'output_num_elements' is not specified, 'OUT' will default to [4]. + parameters = self.model_config.get("parameters", {}) + self.output_num_elements = int( + parameters.get("output_num_elements", {}).get("string_value", 1) + ) + + # To keep track of response threads so that we can delay + # the finalizing the model until all response threads + # have completed. + self.inflight_thread_count = 0 + self.inflight_thread_count_lck = threading.Lock() + + def execute(self, requests): + """`execute` MUST be implemented in every Python model. `execute` + function receives a list of pb_utils.InferenceRequest as the only + argument. This function is called when an inference request is made + for this model. The request.get_response_sender() must be used to + get an InferenceResponseSender object associated with the request. + Use the InferenceResponseSender.send(response=, + flags=) to send responses. + + In the final response sent using the response sender object, you must + set the flags argument to TRITONSERVER_RESPONSE_COMPLETE_FINAL to + indicate no responses will be sent for the corresponding request. If + there is an error, you can set the error argument when creating a + pb_utils.InferenceResponse. Setting the flags argument is optional and + defaults to zero. When the flags argument is set to + TRITONSERVER_RESPONSE_COMPLETE_FINAL providing the response argument is + optional. + + Parameters + ---------- + requests : list + A list of pb_utils.InferenceRequest + + Returns + ------- + None + """ + + # This model does not support batching, so 'request_count' should always + # be 1. 
+        if len(requests) != 1:
+            raise pb_utils.TritonModelException(
+                "unsupported batch size " + str(len(requests))
+            )
+
+        in_input = pb_utils.get_input_tensor_by_name(requests[0], "IN").as_numpy()
+        delay_input = pb_utils.get_input_tensor_by_name(requests[0], "DELAY").as_numpy()
+        if in_input.shape != delay_input.shape:
+            raise pb_utils.TritonModelException(
+                f"expected IN and DELAY shape to match, got {list(in_input.shape)} and {list(delay_input.shape)}."
+            )
+
+        # Start a separate thread to send the responses for the request. The
+        # sending back the responses is delegated to this thread.
+        thread = threading.Thread(
+            target=self.response_thread,
+            args=(requests[0].get_response_sender(), in_input, delay_input),
+        )
+
+        # A model using decoupled transaction policy is not required to send all
+        # responses for the current request before returning from the execute.
+        # To demonstrate the flexibility of the decoupled API, we are running
+        # response thread entirely independent of the execute thread.
+        thread.daemon = True
+
+        with self.inflight_thread_count_lck:
+            self.inflight_thread_count += 1
+
+        thread.start()
+
+        # Read WAIT input for wait time, then return so that Triton can call
+        # execute again with another request.
+        wait_input = pb_utils.get_input_tensor_by_name(requests[0], "WAIT").as_numpy()
+        time.sleep(wait_input[0] / 1000)
+
+        # Unlike in non-decoupled model transaction policy, execute function
+        # here returns no response. A return from this function only notifies
+        # Triton that the model instance is ready to receive another request. As
+        # we are not waiting for the response thread to complete here, it is
+        # possible that at any given time the model may be processing multiple
+        # requests. Depending upon the request workload, this may lead to a lot
+        # of requests being processed by a single model instance at a time. In
+        # real-world models, the developer should be mindful of when to return
+        # from execute and be willing to accept the next request.
+        return None
+
+    def response_thread(self, response_sender, in_input, delay_input):
+        # The response_sender is used to send response(s) associated with the
+        # corresponding request. Iterate over input/delay pairs. Wait for DELAY
+        # milliseconds and then create and send a response.
+
+        idx_dtype = self.idx_dtype
+        out_dtype = self.out_dtype
+
+        for idx in range(in_input.size):
+            in_value = in_input[idx]
+            delay_value = delay_input[idx]
+
+            time.sleep(delay_value / 1000)
+
+            idx_output = pb_utils.Tensor("IDX", numpy.array([idx], idx_dtype))
+            out_output = pb_utils.Tensor(
+                "OUT",
+                numpy.full((self.output_num_elements,), in_value, dtype=out_dtype),
+            )
+            response = pb_utils.InferenceResponse(
+                output_tensors=[idx_output, out_output]
+            )
+            response_sender.send(response)
+
+        # We must close the response sender to indicate to Triton that we are
+        # done sending responses for the corresponding request. We can't use the
+        # response sender after closing it. The response sender is closed by
+        # setting the TRITONSERVER_RESPONSE_COMPLETE_FINAL.
+        response_sender.send(flags=pb_utils.TRITONSERVER_RESPONSE_COMPLETE_FINAL)
+
+        with self.inflight_thread_count_lck:
+            self.inflight_thread_count -= 1
+
+    def finalize(self):
+        """`finalize` is called only once when the model is being unloaded.
+        Implementing `finalize` function is OPTIONAL. This function allows
+        the model to perform any necessary clean ups before exit.
+        Here we will wait for all response threads to complete sending
+        responses.
+ """ + print("Finalize invoked") + + inflight_threads = True + cycles = 0 + logging_time_sec = 5 + sleep_time_sec = 0.1 + cycle_to_log = logging_time_sec / sleep_time_sec + while inflight_threads: + with self.inflight_thread_count_lck: + inflight_threads = self.inflight_thread_count != 0 + if cycles % cycle_to_log == 0: + print( + f"Waiting for {self.inflight_thread_count} response threads to complete..." + ) + if inflight_threads: + time.sleep(sleep_time_sec) + cycles += 1 + + print("Finalize complete...") diff --git a/examples/decoupled/square_client.py b/examples/decoupled/square_client.py new file mode 100644 index 00000000..0751f13c --- /dev/null +++ b/examples/decoupled/square_client.py @@ -0,0 +1,129 @@ +# Copyright 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import queue +import sys +from functools import partial + +import numpy as np +import tritonclient.grpc as grpcclient +from tritonclient.utils import * + + +class UserData: + def __init__(self): + self._completed_requests = queue.Queue() + + +def callback(user_data, result, error): + if error: + user_data._completed_requests.put(error) + else: + user_data._completed_requests.put(result) + + +# This client sends a 4 requests to the model with the +# input as: [4], [2], [0] and [1] respectively. 
In +# compliance with the behavior of square_int32 model, +# it will expect the 4 responses for the 1st request +# each with output [4], 2 responses for 2nd request +# each with output [2], no response for the 3rd request +# and finally 1 response for the 4th request with output +# [1] +model_name = "square_int32" +in_values = [4, 2, 0, 1] +inputs = [grpcclient.InferInput("IN", [1], np_to_triton_dtype(np.int32))] +outputs = [grpcclient.InferRequestedOutput("OUT")] + +user_data = UserData() + +with grpcclient.InferenceServerClient( + url="localhost:8001", verbose=True +) as triton_client: + # Establish stream + triton_client.start_stream(callback=partial(callback, user_data)) + + # Send specified many requests in parallel + for i in range(len(in_values)): + in_data = np.array([in_values[i]], dtype=np.int32) + inputs[0].set_data_from_numpy(in_data) + + triton_client.async_stream_infer( + model_name=model_name, + inputs=inputs, + request_id=str(i), + outputs=outputs, + ) + + # Retrieve results... + recv_count = 0 + expected_count = sum(in_values) + result_dict = {} + while recv_count < expected_count: + data_item = user_data._completed_requests.get() + if type(data_item) == InferenceServerException: + raise data_item + else: + this_id = data_item.get_response().id + if this_id not in result_dict.keys(): + result_dict[this_id] = [] + result_dict[this_id].append((recv_count, data_item)) + + recv_count += 1 + + # Validate results... + for i in range(len(in_values)): + this_id = str(i) + if in_values[i] != 0 and this_id not in result_dict.keys(): + print("response for request id {} not received".format(this_id)) + sys.exit(1) + elif in_values[i] == 0 and this_id in result_dict.keys(): + print("received unexpected response for request id {}".format(this_id)) + sys.exit(1) + if in_values[i] != 0: + if len(result_dict[this_id]) != in_values[i]: + print( + "expected {} many responses for request id {}, got {}".format( + in_values[i], this_id, result_dict[this_id] + ) + ) + sys.exit(1) + + if in_values[i] != 0: + result_list = result_dict[this_id] + expected_data = np.array([in_values[i]], dtype=np.int32) + for j in range(len(result_list)): + this_data = result_list[j][1].as_numpy("OUT") + if not np.array_equal(expected_data, this_data): + print( + "incorrect data: expected {}, got {}".format( + expected_data, this_data + ) + ) + sys.exit(1) + + print("PASS: square_int32") + sys.exit(0) diff --git a/examples/decoupled/square_config.pbtxt b/examples/decoupled/square_config.pbtxt new file mode 100644 index 00000000..a8af1a8f --- /dev/null +++ b/examples/decoupled/square_config.pbtxt @@ -0,0 +1,48 @@ +# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. 
+# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +name: "square_int32" +backend: "python" +max_batch_size: 0 +model_transaction_policy { + decoupled: True +} +input [ + { + name: "IN" + data_type: TYPE_INT32 + dims: [ 1 ] + } +] +output [ + { + name: "OUT" + data_type: TYPE_INT32 + dims: [ 1 ] + } +] +instance_group [{ kind: KIND_CPU }] + diff --git a/examples/decoupled/square_model.py b/examples/decoupled/square_model.py new file mode 100644 index 00000000..b6f6fafb --- /dev/null +++ b/examples/decoupled/square_model.py @@ -0,0 +1,245 @@ +# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import json +import threading +import time + +import numpy as np + +# triton_python_backend_utils is available in every Triton Python model. You +# need to use this module to create inference requests and responses. It also +# contains some utility functions for extracting information from model_config +# and converting Triton input/output types to numpy types. +import triton_python_backend_utils as pb_utils + + +class TritonPythonModel: + """Your Python model must use the same class name. Every Python model + that is created must have "TritonPythonModel" as the class name. + + This model demonstrates how to write a decoupled model where each + request can generate 0 to many responses. 
+ + This model has one input and one output. The model can support batching, + with constraint that each request must be batch-1 request, but the shapes + described here refer to the non-batch portion of the shape. + + - Input 'IN' must have shape [1] and datatype INT32. + - Output 'OUT' must have shape [1] and datatype INT32. + + For a request, the backend will sent 'n' responses where 'n' is the + element in IN. For each response, OUT will equal the element of IN. + """ + + def initialize(self, args): + """`initialize` is called only once when the model is being loaded. + Implementing `initialize` function is optional. This function allows + the model to initialize any state associated with this model. + + Parameters + ---------- + args : dict + Both keys and values are strings. The dictionary keys and values are: + * model_config: A JSON string containing the model configuration + * model_instance_kind: A string containing model instance kind + * model_instance_device_id: A string containing model instance device ID + * model_repository: Model repository path + * model_version: Model version + * model_name: Model name + """ + + # You must parse model_config. JSON string is not parsed here + self.model_config = model_config = json.loads(args["model_config"]) + + using_decoupled = pb_utils.using_decoupled_model_transaction_policy( + model_config + ) + if not using_decoupled: + raise pb_utils.TritonModelException( + """the model `{}` can generate any number of responses per request, + enable decoupled transaction policy in model configuration to + serve this model""".format( + args["model_name"] + ) + ) + + # Get IN configuration + in_config = pb_utils.get_input_config_by_name(model_config, "IN") + + # Validate the shape and data type of IN + in_shape = in_config["dims"] + if (len(in_shape) != 1) or (in_shape[0] != 1): + raise pb_utils.TritonModelException( + """the model `{}` requires the shape of 'IN' to be + [1], got {}""".format( + args["model_name"], in_shape + ) + ) + if in_config["data_type"] != "TYPE_INT32": + raise pb_utils.TritonModelException( + """the model `{}` requires the data_type of 'IN' to be + 'TYPE_INT32', got {}""".format( + args["model_name"], in_config["data_type"] + ) + ) + + # Get OUT configuration + out_config = pb_utils.get_output_config_by_name(model_config, "OUT") + + # Validate the shape and data type of OUT + out_shape = out_config["dims"] + if (len(out_shape) != 1) or (out_shape[0] != 1): + raise pb_utils.TritonModelException( + """the model `{}` requires the shape of 'OUT' to be + [1], got {}""".format( + args["model_name"], out_shape + ) + ) + if out_config["data_type"] != "TYPE_INT32": + raise pb_utils.TritonModelException( + """the model `{}` requires the data_type of 'OUT' to be + 'TYPE_INT32', got {}""".format( + args["model_name"], out_config["data_type"] + ) + ) + + self.inflight_thread_count = 0 + self.inflight_thread_count_lck = threading.Lock() + + def execute(self, requests): + """`execute` MUST be implemented in every Python model. `execute` + function receives a list of pb_utils.InferenceRequest as the only + argument. This function is called when an inference request is made + for this model. The request.get_response_sender() must be used to + get an InferenceResponseSender object associated with the request. + Use the InferenceResponseSender.send(response=, + flags=) to send responses. 
+ + In the final response sent using the response sender object, you must + set the flags argument to TRITONSERVER_RESPONSE_COMPLETE_FINAL to + indicate no responses will be sent for the corresponding request. If + there is an error, you can set the error argument when creating a + pb_utils.InferenceResponse. Setting the flags argument is optional and + defaults to zero. When the flags argument is set to + TRITONSERVER_RESPONSE_COMPLETE_FINAL providing the response argument is + optional. + + Parameters + ---------- + requests : list + A list of pb_utils.InferenceRequest + + Returns + ------- + None + """ + + # Visit individual request to start processing them. Note that execute + # function is not required to wait for all the requests of the current + # batch to be processed before returning. + for request in requests: + self.process_request(request) + + # Unlike in non-decoupled model transaction policy, execute function + # here returns no response. A return from this function only notifies + # Triton that the model instance is ready to receive another batch of + # requests. As we are not waiting for the response thread to complete + # here, it is possible that at any give time the model may be processing + # multiple batches of requests. Depending upon the request workload, + # this may lead to a lot of requests being processed by a single model + # instance at a time. In real-world models, the developer should be + # mindful of when to return from execute and be willing to accept next + # request batch. + return None + + def process_request(self, request): + # Start a separate thread to send the responses for the request. The + # sending back the responses is delegated to this thread. + thread = threading.Thread( + target=self.response_thread, + args=( + request.get_response_sender(), + pb_utils.get_input_tensor_by_name(request, "IN").as_numpy(), + ), + ) + + # A model using decoupled transaction policy is not required to send all + # responses for the current request before returning from the execute. + # To demonstrate the flexibility of the decoupled API, we are running + # response thread entirely independent of the execute thread. + thread.daemon = True + + with self.inflight_thread_count_lck: + self.inflight_thread_count += 1 + + thread.start() + + def response_thread(self, response_sender, in_input): + # The response_sender is used to send response(s) associated with the + # corresponding request. + + for idx in range(in_input[0]): + out_output = pb_utils.Tensor("OUT", np.array([in_input[0]], np.int32)) + response = pb_utils.InferenceResponse(output_tensors=[out_output]) + response_sender.send(response) + + # We must close the response sender to indicate to Triton that we are + # done sending responses for the corresponding request. We can't use the + # response sender after closing it. The response sender is closed by + # setting the TRITONSERVER_RESPONSE_COMPLETE_FINAL. + response_sender.send(flags=pb_utils.TRITONSERVER_RESPONSE_COMPLETE_FINAL) + + with self.inflight_thread_count_lck: + self.inflight_thread_count -= 1 + + def finalize(self): + """`finalize` is called only once when the model is being unloaded. + Implementing `finalize` function is OPTIONAL. This function allows + the model to perform any necessary clean ups before exit. + Here we will wait for all response threads to complete sending + responses. 
+ """ + + print("Finalize invoked") + + inflight_threads = True + cycles = 0 + logging_time_sec = 5 + sleep_time_sec = 0.1 + cycle_to_log = logging_time_sec / sleep_time_sec + while inflight_threads: + with self.inflight_thread_count_lck: + inflight_threads = self.inflight_thread_count != 0 + if cycles % cycle_to_log == 0: + print( + f"Waiting for {self.inflight_thread_count} response threads to complete..." + ) + if inflight_threads: + time.sleep(sleep_time_sec) + cycles += 1 + + print("Finalize complete...") diff --git a/examples/instance_kind/README.md b/examples/instance_kind/README.md new file mode 100644 index 00000000..360f72a6 --- /dev/null +++ b/examples/instance_kind/README.md @@ -0,0 +1,199 @@ + + +# Model Instance Kind Example + +Triton model configuration allows users to provide kind to [instance group +settings.](https://github.com/triton-inference-server/server/blob/main/docs/user_guide/model_configuration.md#instance-groups) +A python backend model can be written to respect the kind setting to control +the execution of a model instance either on CPU or GPU. + +In this example, we demonstrate how this can be achieved for your python model. +We will use a `ResNet50` model as our base model for this example. + +## Create a ResNet50 model repository + +We will use the files that come with this example to create the model +repository. + +First, download the [client.py](client.py), [config.pbtxt](config.pbtxt), +[resnet50_labels.txt](resnet50_labels.txt), and [model.py](model.py) +to your local machine. + +Next, in the same directory with the four aforementioned files, create the model +repository with the following commands: +``` +mkdir -p models/resnet50/1 && +mv model.py models/resnet50/1/ && +mv config.pbtxt models/resnet50/ +``` + +## Pull the Triton Docker images + +We need to install Docker and NVIDIA Container Toolkit before proceeding, refer +to the +[installation steps](https://github.com/triton-inference-server/server/tree/main/docs#installation). + +To pull the latest containers, run the following commands: +``` +docker pull nvcr.io/nvidia/tritonserver:-py3 +docker pull nvcr.io/nvidia/tritonserver:-py3-sdk +``` +See the installation steps above for the `` version. + +For example, if the latest version is `23.01`, the above commands translate +to the following: +``` +docker pull nvcr.io/nvidia/tritonserver:23.01-py3 +docker pull nvcr.io/nvidia/tritonserver:23.01-py3-sdk +``` + +Be sure to replace the `` with the version pulled for all the remaining +parts of this example. + +## Start the Triton Server + +At the directory where we copied our resnet50 model (at where the "models" +folder is located), run the following command: +``` +docker run --gpus all --shm-size 1G -it --rm -p 8000:8000 -v `pwd`:/instance_kind nvcr.io/nvidia/tritonserver:-py3 /bin/bash +``` + +Inside the container, we need to install `torch`, `torchvision` and `pillow` to run +this example. We recommend to use `pip` method for the installation: + +``` +pip3 install torch==1.13.0+cu117 -f https://download.pytorch.org/whl/torch_stable.html torchvision==0.14.0+cu117 pillow +``` + +Finally, we need to start the Triton Server: +``` +tritonserver --model-repository /instance_kind/models +``` + +To leave the container for the next step, press: `CTRL + P + Q`. 
+
+## Start the Triton SDK Container and Test Inference
+
+To start the SDK container, run the following command:
+```
+docker run --gpus all --network=host --pid=host --ipc=host -v `pwd`:/instance_kind -ti nvcr.io/nvidia/tritonserver:-py3-sdk /bin/bash
+```
+
+The `client.py` requires the following packages to be installed: `torch`,
+`torchvision`, `pillow` and `validators`. Similarly, we recommend using `pip`
+for the installation:
+
+```
+pip3 install torch==1.13.0+cu117 -f https://download.pytorch.org/whl/torch_stable.html torchvision==0.14.0+cu117 pillow validators
+```
+
+Finally, let's test an inference call with the following command:
+```
+python client.py
+```
+On a first run, a successful inference will print the following at the end:
+```
+Downloading: "https://github.com/NVIDIA/DeepLearningExamples/zipball/torchhub" to /root/.cache/torch/hub/torchhub.zip
+Results is class: TABBY
+PASS: ResNet50
+```
+It may take some time due to `torchhub` downloads, but any future calls
+will be quicker, since the client will use already downloaded artifacts.
+
+## Test Instance Kind
+
+The provided `config.pbtxt` sets the instance group setting to `KIND_CPU`,
+which enables the execution of a model on the CPU.
+To test that your model is actually loaded onto the CPU, run the following:
+```
+python client.py -v
+```
+The `-v` argument asks the client to request the model's configuration from
+the server and print it in your console:
+```
+{
+  ...,
+  "instance_group": [
+    {
+      "name": "resnet50_0",
+      "kind": "KIND_CPU",
+      "count": 1,
+      "gpus": [],
+      "secondary_devices": [],
+      "profile": [],
+      "passive": false,
+      "host_policy": ""
+    }
+  ],
+  ...
+}
+Results is class: TABBY
+PASS: ResNet50 instance kind
+```
+
+Based on the printed model config, we can see that the `instance_group` field
+has a `kind` entry, which is set to `KIND_CPU`.
+
+To change an `instance_group` parameter to `KIND_GPU`, a user can simply replace
+`KIND_CPU` with `KIND_GPU` in the `config.pbtxt`. After restarting the server
+with the updated config file, a successful inference request with the `-v` argument
+will result in similar output, but with an updated `instance_group` entry:
+```
+{
+  ...,
+  "instance_group": [
+    {
+      "name": "resnet50_0",
+      "kind": "KIND_GPU",
+      "count": 1,
+      "gpus": [
+        0
+      ],
+      "secondary_devices": [],
+      "profile": [],
+      "passive": false,
+      "host_policy": ""
+    }
+  ],
+  ...
+}
+Results is class: TABBY
+PASS: ResNet50 instance kind
+```
+It is also possible to load multiple model instances on CPU and GPU
+if necessary.
+
+The instance group setting below will create two model instances,
+one on CPU and the other on GPU:
+```
+instance_group [{ kind: KIND_CPU }, { kind: KIND_GPU }]
+```
+
+For more information on possible model configurations,
+check out the Triton Server documentation [here](https://github.com/triton-inference-server/server/blob/main/docs/user_guide/model_configuration.md#model-configuration)
\ No newline at end of file
diff --git a/examples/instance_kind/client.py b/examples/instance_kind/client.py
new file mode 100644
index 00000000..f36c4e2b
--- /dev/null
+++ b/examples/instance_kind/client.py
@@ -0,0 +1,116 @@
+# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import argparse +import json +import sys +import warnings + +import numpy as np +import torch +import tritonclient.http as httpclient +from tritonclient.utils import * + +warnings.filterwarnings("ignore") + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument( + "--model_name", + type=str, + required=False, + default="resnet50", + help="Model name", + ) + parser.add_argument( + "--image_url", + type=str, + required=False, + default="http://images.cocodataset.org/test2017/000000557146.jpg", + help="Image URL. Default is:\ + http://images.cocodataset.org/test2017/000000557146.jpg", + ) + parser.add_argument( + "--url", + type=str, + required=False, + default="localhost:8000", + help="Inference server URL. 
Default is localhost:8000.", + ) + parser.add_argument( + "-v", + "--verbose", + action="http://github.com/store_true", + required=False, + default=False, + help="Enable verbose output", + ) + parser.add_argument( + "--label_file", + type=str, + required=False, + default="./resnet50_labels.txt", + help="Path to the file with text representation \ + of available labels", + ) + args = parser.parse_args() + + utils = torch.hub.load( + "NVIDIA/DeepLearningExamples:torchhub", + "nvidia_convnets_processing_utils", + skip_validation=True, + ) + + try: + triton_client = httpclient.InferenceServerClient(args.url) + except Exception as e: + print("channel creation failed: " + str(e)) + sys.exit(1) + + with open(args.label_file) as f: + labels_dict = {idx: line.strip() for idx, line in enumerate(f)} + + if args.verbose: + print(json.dumps(triton_client.get_model_config(args.model_name), indent=4)) + + input_name = "INPUT" + output_name = "OUTPUT" + batch = np.asarray(utils.prepare_input_from_uri(args.image_url)) + + input = httpclient.InferInput(input_name, batch.shape, "FP32") + output = httpclient.InferRequestedOutput(output_name) + + input.set_data_from_numpy(batch) + results = triton_client.infer( + model_name=args.model_name, inputs=[input], outputs=[output] + ) + + output_data = results.as_numpy(output_name) + max_id = np.argmax(output_data, axis=1)[0] + print("Results is class: {}".format(labels_dict[max_id])) + + print("PASS: ResNet50 instance kind") + sys.exit(0) diff --git a/examples/instance_kind/config.pbtxt b/examples/instance_kind/config.pbtxt new file mode 100644 index 00000000..f3aee058 --- /dev/null +++ b/examples/instance_kind/config.pbtxt @@ -0,0 +1,42 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
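+
+# Model configuration for the Python backend ResNet50 example: one FP32 input
+# image in NCHW layout with shape [ 3, 224, 224 ] (batched up to 128) and one
+# FP32 output of 1000 class scores. The instance_group entry below selects
+# CPU execution; switch it to KIND_GPU (see the README) to run the instance
+# on a GPU.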
+
+name: "resnet50"
+backend: "python"
+max_batch_size: 128
+input {
+  name: "INPUT"
+  data_type: TYPE_FP32
+  format: FORMAT_NCHW
+  dims: [ 3, 224, 224 ]
+  }
+output {
+  name: "OUTPUT"
+  data_type: TYPE_FP32
+  dims: [ 1000 ]
+  }
+
+instance_group [{ kind: KIND_CPU }]
diff --git a/examples/instance_kind/model.py b/examples/instance_kind/model.py
new file mode 100644
index 00000000..baff8e7b
--- /dev/null
+++ b/examples/instance_kind/model.py
@@ -0,0 +1,82 @@
+# Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+#  * Redistributions of source code must retain the above copyright
+#    notice, this list of conditions and the following disclaimer.
+#  * Redistributions in binary form must reproduce the above copyright
+#    notice, this list of conditions and the following disclaimer in the
+#    documentation and/or other materials provided with the distribution.
+#  * Neither the name of NVIDIA CORPORATION nor the names of its
+#    contributors may be used to endorse or promote products derived
+#    from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+import numpy as np
+import torch
+import triton_python_backend_utils as pb_utils
+from torch.utils.dlpack import to_dlpack
+
+
+class TritonPythonModel:
+    def initialize(self, args):
+        """
+        This function initializes a pre-trained ResNet50 model,
+        depending on the value specified by the `instance_group` parameter
+        in `config.pbtxt`.
+
+        Depending on the `instance_group` specified in
+        the config.pbtxt file (KIND_CPU or KIND_GPU), the model instance
+        will be initialized on a CPU, a GPU, or both. If `instance_group` was
+        not specified in the config file, the model will be loaded onto
+        the default device of the framework.
+        """
+        # Here we set up the device onto which our model will be loaded,
+        # based on the specified `model_instance_kind` and
+        # `model_instance_device_id` fields.
+        device = "cuda" if args["model_instance_kind"] == "GPU" else "cpu"
+        device_id = args["model_instance_device_id"]
+        self.device = f"{device}:{device_id}"
+        # This example is configured to work with torch=1.13
+        # and torchvision=0.14. Thus, we need to provide a proper tag `0.14.1`
+        # to make sure the loaded ResNet50 is compatible with the
+        # installed `torchvision`.
+        # Refer to the README for installation instructions.
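+        # torch.hub.load below downloads the pinned torchvision v0.14.1
+        # ResNet50 weights into the local hub cache (by default
+        # ~/.cache/torch/hub) on the first run and reuses them afterwards;
+        # .eval() switches the model to inference mode so that dropout and
+        # batch-norm layers behave deterministically.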
+ self.model = ( + torch.hub.load( + "pytorch/vision:v0.14.1", + "resnet50", + weights="IMAGENET1K_V2", + skip_validation=True, + ) + .to(self.device) + .eval() + ) + + def execute(self, requests): + """ + This function receives a list of requests (`pb_utils.InferenceRequest`), + performs inference on every request and appends it to responses. + """ + responses = [] + for request in requests: + input_tensor = pb_utils.get_input_tensor_by_name(request, "INPUT") + with torch.no_grad(): + result = self.model( + torch.as_tensor(input_tensor.as_numpy(), device=self.device) + ) + out_tensor = pb_utils.Tensor.from_dlpack("OUTPUT", to_dlpack(result)) + responses.append(pb_utils.InferenceResponse([out_tensor])) + return responses diff --git a/examples/instance_kind/resnet50_labels.txt b/examples/instance_kind/resnet50_labels.txt new file mode 100644 index 00000000..2376a285 --- /dev/null +++ b/examples/instance_kind/resnet50_labels.txt @@ -0,0 +1,1000 @@ +TENCH +GOLDFISH +WHITE SHARK +TIGER SHARK +HAMMERHEAD SHARK +ELECTRIC RAY +STINGRAY +ROOSTER +HEN +OSTRICH +BRAMBLING +GOLDFINCH +HOUSE FINCH +SNOWBIRD +INDIGO FINCH +ROBIN +BULBUL +JAY +MAGPIE +CHICKADEE +WATER OUZEL +KITE +BALD EAGLE +VULTURE +GREAT GREY OWL +FIRE SALAMANDER +NEWT +EFT +SPOTTED SALAMANDER +AXOLOTL +BULL FROG +TREE FROG +TAILED FROG +LOGGERHEAD +LEATHERBACK TURTLE +MUD TURTLE +TERRAPIN +BOX TURTLE +BANDED GECKO +COMMON IGUANA +AMERICAN CHAMELEON +WHIPTAIL +AGAMA +FRILLED LIZARD +ALLIGATOR LIZARD +GILA MONSTER +GREEN LIZARD +AFRICAN CHAMELEON +KOMODO DRAGON +AFRICAN CROCODILE +AMERICAN ALLIGATOR +TRICERATOPS +THUNDER SNAKE +RINGNECK SNAKE +HOGNOSE SNAKE +GREEN SNAKE +KING SNAKE +GARTER SNAKE +WATER SNAKE +VINE SNAKE +NIGHT SNAKE +BOA +ROCK PYTHON +COBRA +GREEN MAMBA +SEA SNAKE +HORNED VIPER +DIAMONDBACK +SIDEWINDER +TRILOBITE +HARVESTMAN +SCORPION +GARDEN SPIDER +BARN SPIDER +GARDEN SPIDER +BLACK WIDOW +TARANTULA +WOLF SPIDER +TICK +CENTIPEDE +GROUSE +PTARMIGAN +RUFFED GROUSE +PRAIRIE CHICKEN +PEACOCK +QUAIL +PARTRIDGE +AFRICAN GREY +MACAW +COCKATOO +LORIKEET +COUCAL +BEE EATER +HORNBILL +HUMMINGBIRD +JACAMAR +TOUCAN +DRAKE +MERGANSER +GOOSE +BLACK SWAN +TUSKER +ECHIDNA +PLATYPUS +WALLABY +KOALA +WOMBAT +JELLYFISH +SEA ANEMONE +BRAIN CORAL +FLATWORM +NEMATODE +CONCH +SNAIL +SLUG +SEA SLUG +CHITON +CHAMBERED NAUTILUS +DUNGENESS CRAB +ROCK CRAB +FIDDLER CRAB +KING CRAB +AMERICAN LOBSTER +SPINY LOBSTER +CRAYFISH +HERMIT CRAB +ISOPOD +WHITE STORK +BLACK STORK +SPOONBILL +FLAMINGO +LITTLE BLUE HERON +AMERICAN EGRET +BITTERN +CRANE +LIMPKIN +EUROPEAN GALLINULE +AMERICAN COOT +BUSTARD +RUDDY TURNSTONE +RED-BACKED SANDPIPER +REDSHANK +DOWITCHER +OYSTERCATCHER +PELICAN +KING PENGUIN +ALBATROSS +GREY WHALE +KILLER WHALE +DUGONG +SEA LION +CHIHUAHUA +JAPANESE SPANIEL +MALTESE DOG +PEKINESE +SHIH-TZU +BLENHEIM SPANIEL +PAPILLON +TOY TERRIER +RHODESIAN RIDGEBACK +AFGHAN HOUND +BASSET +BEAGLE +BLOODHOUND +BLUETICK +COONHOUND +WALKER HOUND +ENGLISH FOXHOUND +REDBONE +BORZOI +IRISH WOLFHOUND +ITALIAN GREYHOUND +WHIPPET +IBIZAN HOUND +NORWEGIAN ELKHOUND +OTTERHOUND +SALUKI +SCOTTISH DEERHOUND +WEIMARANER +STAFFORDSHIRE BULLTERRIER +STAFFORDSHIRE TERRIER +BEDLINGTON TERRIER +BORDER TERRIER +KERRY BLUE TERRIER +IRISH TERRIER +NORFOLK TERRIER +NORWICH TERRIER +YORKSHIRE TERRIER +WIRE-HAIRED FOX TERRIER +LAKELAND TERRIER +SEALYHAM TERRIER +AIREDALE +CAIRN +AUSTRALIAN TERRIER +DANDIE DINMONT +BOSTON BULL +MINIATURE SCHNAUZER +GIANT SCHNAUZER +STANDARD SCHNAUZER +SCOTCH TERRIER +TIBETAN TERRIER +SILKY TERRIER +WHEATEN TERRIER +WHITE TERRIER +LHASA 
+RETRIEVER +CURLY-COATED RETRIEVER +GOLDEN RETRIEVER +LABRADOR RETRIEVER +CHESAPEAKE BAY RETRIEVER +SHORT-HAIRED POINTER +VISLA +ENGLISH SETTER +IRISH SETTER +GORDON SETTER +BRITTANY SPANIEL +CLUMBER +ENGLISH SPRINGER +WELSH SPRINGER SPANIEL +COCKER SPANIEL +SUSSEX SPANIEL +IRISH WATERSPANIEL +KUVASZ +SCHIPPERKE +GROENENDAEL +MALINOIS +BRIARD +KELPIE +KOMONDOR +OLD ENGLISH SHEEPDOG +SHETLAND SHEEPDOG +COLLIE +BORDER COLLIE +BOUVIER DES FLANDRES +ROTTWEILER +GERMAN SHEPHERD +DOBERMAN +MINIATURE PINSCHER +GREATER SWISS MOUNTAIN DOG +BERNESE MOUNTAIN DOG +APPENZELLER +ENTLEBUCHER +BOXER +BULL MASTIFF +TIBETAN MASTIFF +FRENCH BULLDOG +GREAT DANE +SAINT BERNARD +ESKIMO DOG +MALAMUTE +SIBERIAN HUSKY +DALMATIAN +AFFENPINSCHER +BASENJI +PUG +LEONBERG +NEWFOUNDLAND +GREAT PYRENEES +SAMOYED +POMERANIAN +CHOW +KEESHOND +BRABANCON GRIFFON +PEMBROKE +CARDIGAN +TOY POODLE +MINIATURE POODLE +STANDARD POODLE +MEXICAN HAIRLESS +TIMBER WOLF +WHITE WOLF +RED WOLF +COYOTE +DINGO +DHOLE +AFRICAN HUNTING DOG +HYENA +RED FOX +KIT FOX +ARCTIC FOX +GREY FOX +TABBY +TIGER CAT +PERSIAN CAT +SIAMESE CAT +EGYPTIAN CAT +COUGAR +LYNX +LEOPARD +SNOW LEOPARD +JAGUAR +LION +TIGER +CHEETAH +BROWN BEAR +AMERICAN BLACK BEAR +ICE BEAR +SLOTH BEAR +MONGOOSE +MEERKAT +TIGER BEETLE +LADYBUG +GROUND BEETLE +LONG-HORNED BEETLE +LEAF BEETLE +DUNG BEETLE +RHINOCEROS BEETLE +WEEVIL +FLY +BEE +ANT +GRASSHOPPER +CRICKET +WALKING STICK +COCKROACH +MANTIS +CICADA +LEAFHOPPER +LACEWING +DRAGONFLY +DAMSELFLY +ADMIRAL +RINGLET +MONARCH +CABBAGE BUTTERFLY +SULPHUR BUTTERFLY +LYCAENID +STARFISH +SEA URCHIN +SEA CUCUMBER +WOOD RABBIT +HARE +ANGORA +HAMSTER +PORCUPINE +FOX SQUIRREL +MARMOT +BEAVER +GUINEA PIG +SORREL +ZEBRA +HOG +WILD BOAR +WARTHOG +HIPPOPOTAMUS +OX +WATER BUFFALO +BISON +RAM +BIGHORN +IBEX +HARTEBEEST +IMPALA +GAZELLE +ARABIAN CAMEL +LLAMA +WEASEL +MINK +POLECAT +BLACK-FOOTED FERRET +OTTER +SKUNK +BADGER +ARMADILLO +THREE-TOED SLOTH +ORANGUTAN +GORILLA +CHIMPANZEE +GIBBON +SIAMANG +GUENON +PATAS +BABOON +MACAQUE +LANGUR +COLOBUS +PROBOSCIS MONKEY +MARMOSET +CAPUCHIN +HOWLER MONKEY +TITI +SPIDER MONKEY +SQUIRREL MONKEY +MADAGASCAR CAT +INDRI +INDIAN ELEPHANT +AFRICAN ELEPHANT +LESSER PANDA +GIANT PANDA +BARRACOUTA +EEL +COHO +ROCK BEAUTY +ANEMONE FISH +STURGEON +GAR +LIONFISH +PUFFER +ABACUS +ABAYA +ACADEMIC GOWN +ACCORDION +ACOUSTIC GUITAR +AIRCRAFT CARRIER +AIRLINER +AIRSHIP +ALTAR +AMBULANCE +AMPHIBIAN +ANALOG CLOCK +APIARY +APRON +ASHCAN +ASSAULT RIFLE +BACKPACK +BAKERY +BALANCE BEAM +BALLOON +BALLPOINT +BAND AID +BANJO +BANNISTER +BARBELL +BARBER CHAIR +BARBERSHOP +BARN +BAROMETER +BARREL +BARROW +BASEBALL +BASKETBALL +BASSINET +BASSOON +BATHING CAP +BATH TOWEL +BATHTUB +BEACH WAGON +BEACON +BEAKER +BEARSKIN +BEER BOTTLE +BEER GLASS +BELL COTE +BIB +BICYCLE-BUILT-FOR-TWO +BIKINI +BINDER +BINOCULARS +BIRDHOUSE +BOATHOUSE +BOBSLED +BOLO TIE +BONNET +BOOKCASE +BOOKSHOP +BOTTLECAP +BOW +BOW TIE +BRASS +BRASSIERE +BREAKWATER +BREASTPLATE +BROOM +BUCKET +BUCKLE +BULLETPROOF VEST +BULLET TRAIN +BUTCHER SHOP +CAB +CALDRON +CANDLE +CANNON +CANOE +CAN OPENER +CARDIGAN +CAR MIRROR +CAROUSEL +CARPENTERS KIT +CARTON +CAR WHEEL +CASH MACHINE +CASSETTE +CASSETTE PLAYER +CASTLE +CATAMARAN +CD PLAYER +CELLO +CELLULAR TELEPHONE +CHAIN +CHAINLINK FENCE +CHAIN MAIL +CHAIN SAW +CHEST +CHIFFONIER +CHIME +CHINA CABINET +CHRISTMAS STOCKING +CHURCH +CINEMA +CLEAVER +CLIFF DWELLING +CLOAK +CLOG +COCKTAIL SHAKER +COFFEE MUG +COFFEEPOT +COIL +COMBINATION LOCK +COMPUTER KEYBOARD +CONFECTIONERY +CONTAINER SHIP +CONVERTIBLE +CORKSCREW +CORNET +COWBOY 
BOOT +COWBOY HAT +CRADLE +CRANE +CRASH HELMET +CREATE +CRIB +CROCK POT +CROQUET BALL +CRUTCH +CUIRASS +DAM +DESK +DESKTOP COMPUTER +DIAL TELEPHONE +DIAPER +DIGITAL CLOCK +DIGITAL WATCH +DINING TABLE +DISHRAG +DISHWASHER +DISK BRAKE +DOCK +DOGSLED +DOME +DOORMAT +DRILLING PLATFORM +DRUM +DRUMSTICK +DUMBBELL +DUTCH OVEN +ELECTRIC FAN +ELECTRIC GUITAR +ELECTRIC LOCOMOTIVE +ENTERTAINMENT CENTER +ENVELOPE +ESPRESSO MAKER +FACE POWDER +FEATHER BOA +FILE +FIREBOAT +FIRE ENGINE +FIRE SCREEN +FLAGPOLE +FLUTE +FOLDING CHAIR +FOOTBALL HELMET +FORKLIFT +FOUNTAIN +FOUNTAIN PEN +FOUR-POSTER +FREIGHT CAR +FRENCH HORN +FRYING PAN +FUR COAT +GARBAGE TRUCK +GASMASK +GAS PUMP +GOBLET +GO-KART +GOLF BALL +GOLFCART +GONDOLA +GONG +GOWN +GRAND PIANO +GREENHOUSE +GRILLE +GROCERY STORE +GUILLOTINE +HAIR SLIDE +HAIR SPRAY +HALF TRACK +HAMMER +HAMPER +HAND BLOWER +HAND-HELD COMPUTER +HANDKERCHIEF +HARD DISC +HARMONICA +HARP +HARVESTER +HATCHET +HOLSTER +HOME THEATER +HONEYCOMB +HOOK +HOOPSKIRT +HORIZONTAL BAR +HORSE CART +HOURGLASS +IPOD +IRON +JACK-O-LANTERN +JEAN +JEEP +JERSEY +JIGSAW PUZZLE +JINRIKISHA +JOYSTICK +KIMONO +KNEE PAD +KNOT +LAB COAT +LADLE +LAMPSHADE +LAPTOP +LAWN MOWER +LENS CAP +LETTER OPENER +LIBRARY +LIFEBOAT +LIGHTER +LIMOUSINE +LINER +LIPSTICK +LOAFER +LOTION +LOUDSPEAKER +LOUPE +LUMBERMILL +MAGNETIC COMPASS +MAILBAG +MAILBOX +MAILLOT +MAILLOT +MANHOLE COVER +MARACA +MARIMBA +MASK +MATCHSTICK +MAYPOLE +MAZE +MEASURING CUP +MEDICINE CHEST +MEGALITH +MICROPHONE +MICROWAVE +MILITARY UNIFORM +MILK CAN +MINIBUS +MINISKIRT +MINIVAN +MISSILE +MITTEN +MIXING BOWL +MOBILE HOME +MODEL T +MODEM +MONASTERY +MONITOR +MOPED +MORTAR +MORTARBOARD +MOSQUE +MOSQUITO NET +MOTOR SCOOTER +MOUNTAIN BIKE +MOUNTAIN TENT +MOUSE +MOUSETRAP +MOVING VAN +MUZZLE +NAIL +NECK BRACE +NECKLACE +NIPPLE +NOTEBOOK +OBELISK +OBOE +OCARINA +ODOMETER +OIL FILTER +ORGAN +OSCILLOSCOPE +OVERSKIRT +OXCART +OXYGEN MASK +PACKET +PADDLE +PADDLEWHEEL +PADLOCK +PAINTBRUSH +PAJAMA +PALACE +PANPIPE +PAPER TOWEL +PARACHUTE +PARALLEL BARS +PARK BENCH +PARKING METER +PASSENGER CAR +PATIO +PAY-PHONE +PEDESTAL +PENCIL BOX +PENCIL SHARPENER +PERFUME +PETRI DISH +PHOTOCOPIER +PICK +PICKELHAUBE +PICKET FENCE +PICKUP +PIER +PIGGY BANK +PILL BOTTLE +PILLOW +PING-PONG BALL +PINWHEEL +PIRATE +PITCHER +PLANE +PLANETARIUM +PLASTIC BAG +PLATE RACK +PLOW +PLUNGER +POLAROID CAMERA +POLE +POLICE VAN +PONCHO +POOL TABLE +POP BOTTLE +POT +POTTERS WHEEL +POWER DRILL +PRAYER RUG +PRINTER +PRISON +PROJECTILE +PROJECTOR +PUCK +PUNCHING BAG +PURSE +QUILL +QUILT +RACER +RACKET +RADIATOR +RADIO +RADIO TELESCOPE +RAIN BARREL +RECREATIONAL VEHICLE +REEL +REFLEX CAMERA +REFRIGERATOR +REMOTE CONTROL +RESTAURANT +REVOLVER +RIFLE +ROCKING CHAIR +ROTISSERIE +RUBBER ERASER +RUGBY BALL +RULE +RUNNING SHOE +SAFE +SAFETY PIN +SALTSHAKER +SANDAL +SARONG +SAX +SCABBARD +SCALE +SCHOOL BUS +SCHOONER +SCOREBOARD +SCREEN +SCREW +SCREWDRIVER +SEAT BELT +SEWING MACHINE +SHIELD +SHOE SHOP +SHOJI +SHOPPING BASKET +SHOPPING CART +SHOVEL +SHOWER CAP +SHOWER CURTAIN +SKI +SKI MASK +SLEEPING BAG +SLIDE RULE +SLIDING DOOR +SLOT +SNORKEL +SNOWMOBILE +SNOWPLOW +SOAP DISPENSER +SOCCER BALL +SOCK +SOLAR DISH +SOMBRERO +SOUP BOWL +SPACE BAR +SPACE HEATER +SPACE SHUTTLE +SPATULA +SPEEDBOAT +SPIDER WEB +SPINDLE +SPORTS CAR +SPOTLIGHT +STAGE +STEAM LOCOMOTIVE +STEEL ARCH BRIDGE +STEEL DRUM +STETHOSCOPE +STOLE +STONE WALL +STOPWATCH +STOVE +STRAINER +STREETCAR +STRETCHER +STUDIO COUCH +STUPA +SUBMARINE +SUIT +SUNDIAL +SUNGLASS +SUNGLASSES +SUNSCREEN +SUSPENSION BRIDGE +SWAB +SWEATSHIRT +SWIMMING TRUNKS 
+SWING +SWITCH +SYRINGE +TABLE LAMP +TANK +TAPE PLAYER +TEAPOT +TEDDY +TELEVISION +TENNIS BALL +THATCH +THEATER CURTAIN +THIMBLE +THRESHER +THRONE +TILE ROOF +TOASTER +TOBACCO SHOP +TOILET SEAT +TORCH +TOTEM POLE +TOW TRUCK +TOYSHOP +TRACTOR +TRAILER TRUCK +TRAY +TRENCH COAT +TRICYCLE +TRIMARAN +TRIPOD +TRIUMPHAL ARCH +TROLLEYBUS +TROMBONE +TUB +TURNSTILE +TYPEWRITER KEYBOARD +UMBRELLA +UNICYCLE +UPRIGHT +VACUUM +VASE +VAULT +VELVET +VENDING MACHINE +VESTMENT +VIADUCT +VIOLIN +VOLLEYBALL +WAFFLE IRON +WALL CLOCK +WALLET +WARDROBE +WARPLANE +WASHBASIN +WASHER +WATER BOTTLE +WATER JUG +WATER TOWER +WHISKEY JUG +WHISTLE +WIG +WINDOW SCREEN +WINDOW SHADE +WINDSOR TIE +WINE BOTTLE +WING +WOK +WOODEN SPOON +WOOL +WORM FENCE +WRECK +YAWL +YURT +WEB SITE +COMIC BOOK +CROSSWORD PUZZLE +STREET SIGN +TRAFFIC LIGHT +BOOK JACKET +MENU +PLATE +GUACAMOLE +CONSOMME +HOT POT +TRIFLE +ICE CREAM +ICE LOLLY +FRENCH LOAF +BAGEL +PRETZEL +CHEESEBURGER +HOTDOG +MASHED POTATO +HEAD CABBAGE +BROCCOLI +CAULIFLOWER +ZUCCHINI +SPAGHETTI SQUASH +ACORN SQUASH +BUTTERNUT SQUASH +CUCUMBER +ARTICHOKE +BELL PEPPER +CARDOON +MUSHROOM +GRANNY SMITH +STRAWBERRY +ORANGE +LEMON +FIG +PINEAPPLE +BANANA +JACKFRUIT +CUSTARD APPLE +POMEGRANATE +HAY +CARBONARA +CHOCOLATE SAUCE +DOUGH +MEAT LOAF +PIZZA +POTPIE +BURRITO +RED WINE +ESPRESSO +CUP +EGGNOG +ALP +BUBBLE +CLIFF +CORAL REEF +GEYSER +LAKESIDE +PROMONTORY +SANDBAR +SEASHORE +VALLEY +VOLCANO +BALLPLAYER +GROOM +SCUBA DIVER +RAPESEED +DAISY +LADY SLIPPER +CORN +ACORN +HIP +BUCKEYE +CORAL FUNGUS +AGARIC +GYROMITRA +STINKHORN +EARTHSTAR +HEN-OF-THE-WOODS +BOLETE +EAR +TOILET TISSUE diff --git a/examples/jax/README.md b/examples/jax/README.md new file mode 100644 index 00000000..7501d4ac --- /dev/null +++ b/examples/jax/README.md @@ -0,0 +1,114 @@ + + +# JAX Example + +In this section, we demonstrate an end-to-end example for using +[JAX](https://jax.readthedocs.io/en/latest/) in Python Backend. + +## Create a JAX AddSub model repository + +We will use the files that come with this example to create the model +repository. + +First, download the [client.py](client.py), [config.pbtxt](config.pbtxt) and +[model.py](model.py) to your local machine. + +Next, at the directory where the three files located, create the model +repository with the following commands: +``` +mkdir -p models/jax/1 +mv model.py models/jax/1 +mv config.pbtxt models/jax +``` + +## Pull the Triton Docker images + +We need to install Docker and NVIDIA Container Toolkit before proceeding, refer +to the +[installation steps](https://github.com/triton-inference-server/server/tree/main/docs#installation). + +To pull the latest containers, run the following commands: +``` +docker pull nvcr.io/nvidia/tritonserver:-py3 +docker pull nvcr.io/nvidia/tritonserver:-py3-sdk +``` +See the installation steps above for the `` version. + +At the time of writing, the latest version is `23.04`, which translates to the +following commands: +``` +docker pull nvcr.io/nvidia/tritonserver:23.04-py3 +docker pull nvcr.io/nvidia/tritonserver:23.04-py3-sdk +``` + +Be sure to replace the `` with the version pulled for all the remaining +parts of this example. + +## Start the Triton Server + +At the directory where we created the JAX models (at where the "models" folder +is located), run the following command: +``` +docker run --gpus all -it --rm -p 8000:8000 -v `pwd`:/jax nvcr.io/nvidia/tritonserver:-py3 /bin/bash +``` + +Inside the container, we need to install JAX to run this example. 
+
+We recommend using the `pip` method mentioned in the
+[JAX documentation](https://github.com/google/jax#pip-installation-gpu-cuda).
+Make sure that JAX is available in the same Python environment as other
+dependencies.
+
+To install JAX for this example, run the following command:
+```
+pip3 install --upgrade "jax[cuda12_local]" -f https://storage.googleapis.com/jax-releases/jax_cuda_releases.html
+```
+
+Finally, we need to start the Triton Server by running the following command:
+```
+tritonserver --model-repository=/jax/models
+```
+
+To leave the container for the next step, press: `CTRL + P + Q`.
+
+## Test inference
+
+At the directory where the client.py is located, run the following command:
+```
+docker run --rm --net=host -v `pwd`:/jax nvcr.io/nvidia/tritonserver:<xx.yy>-py3-sdk python3 /jax/client.py
+```
+
+A successful inference will print the following at the end:
+```
+INPUT0 ([0.89262384 0.645457 0.18913145 0.17099917]) + INPUT1 ([0.5703733 0.21917151 0.22854741 0.97336507]) = OUTPUT0 ([1.4629972 0.86462855 0.41767886 1.1443642 ])
+INPUT0 ([0.89262384 0.645457 0.18913145 0.17099917]) - INPUT1 ([0.5703733 0.21917151 0.22854741 0.97336507]) = OUTPUT0 ([ 0.32225055 0.4262855 -0.03941596 -0.8023659 ])
+PASS: jax
+```
+Note: Your inputs may differ from the ones above, but each output always
+corresponds to its inputs.
diff --git a/examples/jax/client.py b/examples/jax/client.py
new file mode 100644
index 00000000..a53d17e9
--- /dev/null
+++ b/examples/jax/client.py
@@ -0,0 +1,82 @@
+# Copyright 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+#  * Redistributions of source code must retain the above copyright
+#    notice, this list of conditions and the following disclaimer.
+#  * Redistributions in binary form must reproduce the above copyright
+#    notice, this list of conditions and the following disclaimer in the
+#    documentation and/or other materials provided with the distribution.
+#  * Neither the name of NVIDIA CORPORATION nor the names of its
+#    contributors may be used to endorse or promote products derived
+#    from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
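+
+# This client generates two random FP32 vectors of length 4, sends them to the
+# "jax" model over HTTP, and checks that OUTPUT0 holds their element-wise sum
+# and OUTPUT1 holds their element-wise difference.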
+ +import sys + +import numpy as np +import tritonclient.http as httpclient +from tritonclient.utils import * + +model_name = "jax" +shape = [4] + +with httpclient.InferenceServerClient("localhost:8000") as client: + input0_data = np.random.rand(*shape).astype(np.float32) + input1_data = np.random.rand(*shape).astype(np.float32) + inputs = [ + httpclient.InferInput( + "INPUT0", input0_data.shape, np_to_triton_dtype(input0_data.dtype) + ), + httpclient.InferInput( + "INPUT1", input1_data.shape, np_to_triton_dtype(input1_data.dtype) + ), + ] + + inputs[0].set_data_from_numpy(input0_data) + inputs[1].set_data_from_numpy(input1_data) + + outputs = [ + httpclient.InferRequestedOutput("OUTPUT0"), + httpclient.InferRequestedOutput("OUTPUT1"), + ] + + response = client.infer(model_name, inputs, request_id=str(1), outputs=outputs) + + result = response.get_response() + output0_data = response.as_numpy("OUTPUT0") + output1_data = response.as_numpy("OUTPUT1") + + print( + "INPUT0 ({}) + INPUT1 ({}) = OUTPUT0 ({})".format( + input0_data, input1_data, output0_data + ) + ) + print( + "INPUT0 ({}) - INPUT1 ({}) = OUTPUT0 ({})".format( + input0_data, input1_data, output1_data + ) + ) + + if not np.allclose(input0_data + input1_data, output0_data): + print("jax example error: incorrect sum") + sys.exit(1) + + if not np.allclose(input0_data - input1_data, output1_data): + print("jax example error: incorrect difference") + sys.exit(1) + + print("PASS: jax") + sys.exit(0) diff --git a/examples/jax/config.pbtxt b/examples/jax/config.pbtxt new file mode 100644 index 00000000..a7e5e5e2 --- /dev/null +++ b/examples/jax/config.pbtxt @@ -0,0 +1,59 @@ +# Copyright 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +name: "jax" +backend: "python" + +input [ + { + name: "INPUT0" + data_type: TYPE_FP32 + dims: [ 4 ] + } +] +input [ + { + name: "INPUT1" + data_type: TYPE_FP32 + dims: [ 4 ] + } +] +output [ + { + name: "OUTPUT0" + data_type: TYPE_FP32 + dims: [ 4 ] + } +] +output [ + { + name: "OUTPUT1" + data_type: TYPE_FP32 + dims: [ 4 ] + } +] + +instance_group [{ kind: KIND_CPU }] diff --git a/examples/jax/model.py b/examples/jax/model.py new file mode 100644 index 00000000..d6840dc9 --- /dev/null +++ b/examples/jax/model.py @@ -0,0 +1,156 @@ +# Copyright 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import json + +import jax.numpy as jnp +import numpy as np + +# triton_python_backend_utils is available in every Triton Python model. You +# need to use this module to create inference requests and responses. It also +# contains some utility functions for extracting information from model_config +# and converting Triton input/output types to numpy types. +import triton_python_backend_utils as pb_utils + + +def AddSub(input_0, input_1): + """ + Simple AddSub operations in JAX. This outputs the sum and subtraction of + the inputs. + JAX API: https://jax.readthedocs.io/en/latest/jax.html + """ + output_0 = jnp.add(input_0, input_1) + output_1 = jnp.subtract(input_0, input_1) + return [output_0, output_1] + + +class TritonPythonModel: + """Your Python model must use the same class name. Every Python model + that is created must have "TritonPythonModel" as the class name. + """ + + def initialize(self, args): + """`initialize` is called only once when the model is being loaded. + Implementing `initialize` function is optional. This function allows + the model to initialize any state associated with this model. + + Parameters + ---------- + args : dict + Both keys and values are strings. 
The dictionary keys and values are: + * model_config: A JSON string containing the model configuration + * model_instance_kind: A string containing model instance kind + * model_instance_device_id: A string containing model instance device ID + * model_repository: Absolute model repository path + * model_version: Model version + * model_name: Model name + """ + + # You must parse model_config. JSON string is not parsed here + self.model_config = model_config = json.loads(args["model_config"]) + + # Get OUTPUT0 configuration + output0_config = pb_utils.get_output_config_by_name(model_config, "OUTPUT0") + + # Get OUTPUT1 configuration + output1_config = pb_utils.get_output_config_by_name(model_config, "OUTPUT1") + + # Convert Triton types to numpy types + self.output0_dtype = pb_utils.triton_string_to_numpy( + output0_config["data_type"] + ) + self.output1_dtype = pb_utils.triton_string_to_numpy( + output1_config["data_type"] + ) + + def execute(self, requests): + """`execute` must be implemented in every Python model. `execute` + function receives a list of pb_utils.InferenceRequest as the only + argument. This function is called when an inference is requested + for this model. Depending on the batching configuration (e.g. Dynamic + Batching) used, `requests` may contain multiple requests. Every + Python model, must create one pb_utils.InferenceResponse for every + pb_utils.InferenceRequest in `requests`. If there is an error, you can + set the error argument when creating a pb_utils.InferenceResponse. + + Parameters + ---------- + requests : list + A list of pb_utils.InferenceRequest + + Returns + ------- + list + A list of pb_utils.InferenceResponse. The length of this list must + be the same as `requests` + """ + + output0_dtype = self.output0_dtype + output1_dtype = self.output1_dtype + + responses = [] + + # Every Python backend must iterate over every one of the requests and + # create a pb_utils.InferenceResponse for each of them. + for request in requests: + # Get INPUT0 + in_0 = pb_utils.get_input_tensor_by_name(request, "INPUT0") + # Get INPUT1 + in_1 = pb_utils.get_input_tensor_by_name(request, "INPUT1") + + out_0, out_1 = AddSub(in_0.as_numpy(), in_1.as_numpy()) + + # Create output tensors. You need pb_utils.Tensor + # objects to create pb_utils.InferenceResponse. + out_tensor_0 = pb_utils.Tensor( + "OUTPUT0", np.array(out_0).astype(output0_dtype) + ) + out_tensor_1 = pb_utils.Tensor( + "OUTPUT1", np.array(out_1).astype(output1_dtype) + ) + + # Create InferenceResponse. You can set an error here in case + # there was a problem with handling this inference request. + # Below is an example of how you can set errors in inference + # response: + # + # pb_utils.InferenceResponse( + # output_tensors=..., TritonError("An error occurred")) + inference_response = pb_utils.InferenceResponse( + output_tensors=[out_tensor_0, out_tensor_1] + ) + responses.append(inference_response) + + # You should return a list of pb_utils.InferenceResponse. Length + # of this list must match the length of `requests` list. + return responses + + def finalize(self): + """`finalize` is called only once when the model is being unloaded. + Implementing `finalize` function is optional. This function allows + the model to perform any necessary clean ups before exit. 
+        """
+        print("Cleaning up...")
diff --git a/examples/preprocessing/README.md b/examples/preprocessing/README.md
new file mode 100644
index 00000000..81ea6923
--- /dev/null
+++ b/examples/preprocessing/README.md
@@ -0,0 +1,71 @@
+
+
+# Preprocessing Using Python Backend Example
+This example shows how to preprocess your inputs using the Python backend before they are passed to the TensorRT model for inference. This ensemble model includes an image preprocessing model (preprocess) and a TensorRT model (resnet50_trt) to do inference.
+
+**1. Converting PyTorch Model to ONNX format:**
+
+Run onnx_exporter.py to convert the ResNet50 PyTorch model to ONNX format. Width and height dims are fixed at 224, but dynamic axes arguments for dynamic batching are used. The commands in steps 2 and 3 should be executed within this Docker container.
+
+    docker run -it --gpus=all -v $(pwd):/workspace nvcr.io/nvidia/pytorch:xx.yy-py3 bash
+    pip install numpy pillow torchvision
+    python onnx_exporter.py --save model.onnx
+
+**2. Create the model repository:**
+
+    mkdir -p model_repository/ensemble_python_resnet50/1
+    mkdir -p model_repository/preprocess/1
+    mkdir -p model_repository/resnet50_trt/1
+
+    # Copy the Python model
+    cp model.py model_repository/preprocess/1
+
+**3. Build a TensorRT engine for the ONNX model**
+
+Use the --fp16 argument to enable fp16 precision. To enable dynamic shapes, use --minShapes, --optShapes, and --maxShapes with --explicitBatch:
+
+    trtexec --onnx=model.onnx --saveEngine=./model_repository/resnet50_trt/1/model.plan --explicitBatch --minShapes=input:1x3x224x224 --optShapes=input:1x3x224x224 --maxShapes=input:256x3x224x224 --fp16
+
+**4. Run the command below to start the server container:**
+
+Under python_backend/examples/preprocessing, run this command to start the server docker container:
+
+    docker run --gpus=all -it --rm -p8000:8000 -p8001:8001 -p8002:8002 -v$(pwd):/workspace/ -v/$(pwd)/model_repository:/models nvcr.io/nvidia/tritonserver:xx.yy-py3 bash
+    pip install numpy pillow torchvision
+    tritonserver --model-repository=/models
+
+**5. Start the client to test:**
+
+Under python_backend/examples/preprocessing, run the commands below to start the client Docker container:
+
+    wget https://raw.githubusercontent.com/triton-inference-server/server/main/qa/images/mug.jpg -O "mug.jpg"
+    docker run --rm --net=host -v $(pwd):/workspace/ nvcr.io/nvidia/tritonserver:xx.yy-py3-sdk python client.py --image mug.jpg
+    The result of classification is:COFFEE MUG
+
+Here, we input an image of a mug and the inference result is "COFFEE MUG", which is correct.
diff --git a/examples/preprocessing/client.py b/examples/preprocessing/client.py
new file mode 100644
index 00000000..1ac107af
--- /dev/null
+++ b/examples/preprocessing/client.py
@@ -0,0 +1,106 @@
+# Copyright 2021-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+#  * Redistributions of source code must retain the above copyright
+#    notice, this list of conditions and the following disclaimer.
+#  * Redistributions in binary form must reproduce the above copyright
+#    notice, this list of conditions and the following disclaimer in the
+#    documentation and/or other materials provided with the distribution.
+# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import argparse +import json +import sys + +import numpy as np +import tritonclient.grpc as tritongrpcclient + + +def load_image(img_path: str): + """ + Loads an encoded image as an array of bytes. + + """ + return np.fromfile(img_path, dtype="uint8") + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument( + "--model_name", + type=str, + required=False, + default="ensemble_python_resnet50", + help="Model name", + ) + parser.add_argument("--image", type=str, required=True, help="Path to the image") + parser.add_argument( + "--url", + type=str, + required=False, + default="localhost:8001", + help="Inference server URL. Default is localhost:8001.", + ) + parser.add_argument( + "-v", + "--verbose", + action="http://github.com/store_true", + required=False, + default=False, + help="Enable verbose output", + ) + parser.add_argument( + "--label_file", + type=str, + default="./model_repository/resnet50_trt/labels.txt", + help="Path to the file with text representation of available labels", + ) + args = parser.parse_args() + + try: + triton_client = tritongrpcclient.InferenceServerClient( + url=args.url, verbose=args.verbose + ) + except Exception as e: + print("channel creation failed: " + str(e)) + sys.exit(1) + + with open(args.label_file) as f: + labels_dict = {idx: line.strip() for idx, line in enumerate(f)} + + inputs = [] + outputs = [] + input_name = "INPUT" + output_name = "OUTPUT" + image_data = load_image(args.image) + image_data = np.expand_dims(image_data, axis=0) + + inputs.append(tritongrpcclient.InferInput(input_name, image_data.shape, "UINT8")) + outputs.append(tritongrpcclient.InferRequestedOutput(output_name)) + + inputs[0].set_data_from_numpy(image_data) + results = triton_client.infer( + model_name=args.model_name, inputs=inputs, outputs=outputs + ) + + output0_data = results.as_numpy(output_name) + print(output0_data) + maxs = np.argmax(output0_data, axis=1) + print(maxs) + print("Result is class: {}".format(labels_dict[maxs[0]])) diff --git a/examples/preprocessing/model.py b/examples/preprocessing/model.py new file mode 100644 index 00000000..90259978 --- /dev/null +++ b/examples/preprocessing/model.py @@ -0,0 +1,154 @@ +# Copyright 2021-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import io +import json + +import numpy as np +import torchvision.transforms as transforms + +# triton_python_backend_utils is available in every Triton Python model. You +# need to use this module to create inference requests and responses. It also +# contains some utility functions for extracting information from model_config +# and converting Triton input/output types to numpy types. +import triton_python_backend_utils as pb_utils +from PIL import Image + + +class TritonPythonModel: + """Your Python model must use the same class name. Every Python model + that is created must have "TritonPythonModel" as the class name. + """ + + def initialize(self, args): + """`initialize` is called only once when the model is being loaded. + Implementing `initialize` function is optional. This function allows + the model to initialize any state associated with this model. + + Parameters + ---------- + args : dict + Both keys and values are strings. The dictionary keys and values are: + * model_config: A JSON string containing the model configuration + * model_instance_kind: A string containing model instance kind + * model_instance_device_id: A string containing model instance device ID + * model_repository: Model repository path + * model_version: Model version + * model_name: Model name + """ + + # You must parse model_config. JSON string is not parsed here + self.model_config = model_config = json.loads(args["model_config"]) + + # Get OUTPUT0 configuration + output0_config = pb_utils.get_output_config_by_name(model_config, "OUTPUT_0") + + # Convert Triton types to numpy types + self.output0_dtype = pb_utils.triton_string_to_numpy( + output0_config["data_type"] + ) + + def execute(self, requests): + """`execute` MUST be implemented in every Python model. `execute` + function receives a list of pb_utils.InferenceRequest as the only + argument. This function is called when an inference request is made + for this model. Depending on the batching configuration (e.g. Dynamic + Batching) used, `requests` may contain multiple requests. 
Every + Python model, must create one pb_utils.InferenceResponse for every + pb_utils.InferenceRequest in `requests`. If there is an error, you can + set the error argument when creating a pb_utils.InferenceResponse + + Parameters + ---------- + requests : list + A list of pb_utils.InferenceRequest + + Returns + ------- + list + A list of pb_utils.InferenceResponse. The length of this list must + be the same as `requests` + """ + + output0_dtype = self.output0_dtype + + responses = [] + + # Every Python backend must iterate over everyone of the requests + # and create a pb_utils.InferenceResponse for each of them. + for request in requests: + # Get INPUT0 + in_0 = pb_utils.get_input_tensor_by_name(request, "INPUT_0") + + normalize = transforms.Normalize( + mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225] + ) + + loader = transforms.Compose( + [ + transforms.Resize([224, 224]), + transforms.CenterCrop(224), + transforms.ToTensor(), + normalize, + ] + ) + + def image_loader(image_name): + image = loader(image_name) + # expand the dimension to nchw + image = image.unsqueeze(0) + return image + + img = in_0.as_numpy() + + image = Image.open(io.BytesIO(img.tobytes())) + img_out = image_loader(image) + img_out = np.array(img_out) + + out_tensor_0 = pb_utils.Tensor("OUTPUT_0", img_out.astype(output0_dtype)) + + # Create InferenceResponse. You can set an error here in case + # there was a problem with handling this inference request. + # Below is an example of how you can set errors in inference + # response: + # + # pb_utils.InferenceResponse( + # output_tensors=..., TritonError("An error occurred")) + inference_response = pb_utils.InferenceResponse( + output_tensors=[out_tensor_0] + ) + responses.append(inference_response) + + # You should return a list of pb_utils.InferenceResponse. Length + # of this list must match the length of `requests` list. + return responses + + def finalize(self): + """`finalize` is called only once when the model is being unloaded. + Implementing `finalize` function is OPTIONAL. This function allows + the model to perform any necessary clean ups before exit. + """ + print("Cleaning up...") diff --git a/examples/preprocessing/model_repository/ensemble_python_resnet50/config.pbtxt b/examples/preprocessing/model_repository/ensemble_python_resnet50/config.pbtxt new file mode 100644 index 00000000..e0b5c117 --- /dev/null +++ b/examples/preprocessing/model_repository/ensemble_python_resnet50/config.pbtxt @@ -0,0 +1,71 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +name: "ensemble_python_resnet50" +platform: "ensemble" +max_batch_size: 256 +input [ + { + name: "INPUT" + data_type: TYPE_UINT8 + dims: [ -1 ] + } +] +output [ + { + name: "OUTPUT" + data_type: TYPE_FP32 + dims: [ 1000 ] + } +] +ensemble_scheduling { + step [ + { + model_name: "preprocess" + model_version: -1 + input_map { + key: "INPUT_0" + value: "INPUT" + } + output_map { + key: "OUTPUT_0" + value: "preprocessed_image" + } + }, + { + model_name: "resnet50_trt" + model_version: -1 + input_map { + key: "input" + value: "preprocessed_image" + } + output_map { + key: "output" + value: "OUTPUT" + } + } + ] +} diff --git a/examples/preprocessing/model_repository/preprocess/config.pbtxt b/examples/preprocessing/model_repository/preprocess/config.pbtxt new file mode 100644 index 00000000..fcfbd93b --- /dev/null +++ b/examples/preprocessing/model_repository/preprocess/config.pbtxt @@ -0,0 +1,47 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +name: "preprocess" +backend: "python" +max_batch_size: 256 +input [ +{ + name: "INPUT_0" + data_type: TYPE_UINT8 + dims: [ -1 ] +} +] + +output [ +{ + name: "OUTPUT_0" + data_type: TYPE_FP32 + dims: [ 3, 224, 224 ] +} +] + +instance_group [{ kind: KIND_CPU }] + diff --git a/examples/preprocessing/model_repository/resnet50_trt/config.pbtxt b/examples/preprocessing/model_repository/resnet50_trt/config.pbtxt new file mode 100644 index 00000000..a4b94402 --- /dev/null +++ b/examples/preprocessing/model_repository/resnet50_trt/config.pbtxt @@ -0,0 +1,45 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +name: "resnet50_trt" +platform: "tensorrt_plan" +max_batch_size: 256 +input [ +{ + name: "input" + data_type: TYPE_FP32 + dims: [3, -1, -1 ] + +} +] +output[ +{ + name: "output" + data_type: TYPE_FP32 + dims: [ 1000 ] + label_filename: "labels.txt" +} +] diff --git a/examples/preprocessing/model_repository/resnet50_trt/labels.txt b/examples/preprocessing/model_repository/resnet50_trt/labels.txt new file mode 100644 index 00000000..2376a285 --- /dev/null +++ b/examples/preprocessing/model_repository/resnet50_trt/labels.txt @@ -0,0 +1,1000 @@ +TENCH +GOLDFISH +WHITE SHARK +TIGER SHARK +HAMMERHEAD SHARK +ELECTRIC RAY +STINGRAY +ROOSTER +HEN +OSTRICH +BRAMBLING +GOLDFINCH +HOUSE FINCH +SNOWBIRD +INDIGO FINCH +ROBIN +BULBUL +JAY +MAGPIE +CHICKADEE +WATER OUZEL +KITE +BALD EAGLE +VULTURE +GREAT GREY OWL +FIRE SALAMANDER +NEWT +EFT +SPOTTED SALAMANDER +AXOLOTL +BULL FROG +TREE FROG +TAILED FROG +LOGGERHEAD +LEATHERBACK TURTLE +MUD TURTLE +TERRAPIN +BOX TURTLE +BANDED GECKO +COMMON IGUANA +AMERICAN CHAMELEON +WHIPTAIL +AGAMA +FRILLED LIZARD +ALLIGATOR LIZARD +GILA MONSTER +GREEN LIZARD +AFRICAN CHAMELEON +KOMODO DRAGON +AFRICAN CROCODILE +AMERICAN ALLIGATOR +TRICERATOPS +THUNDER SNAKE +RINGNECK SNAKE +HOGNOSE SNAKE +GREEN SNAKE +KING SNAKE +GARTER SNAKE +WATER SNAKE +VINE SNAKE +NIGHT SNAKE +BOA +ROCK PYTHON +COBRA +GREEN MAMBA +SEA SNAKE +HORNED VIPER +DIAMONDBACK +SIDEWINDER +TRILOBITE +HARVESTMAN +SCORPION +GARDEN SPIDER +BARN SPIDER +GARDEN SPIDER +BLACK WIDOW +TARANTULA +WOLF SPIDER +TICK +CENTIPEDE +GROUSE +PTARMIGAN +RUFFED GROUSE +PRAIRIE CHICKEN +PEACOCK +QUAIL +PARTRIDGE +AFRICAN GREY +MACAW +COCKATOO +LORIKEET +COUCAL +BEE EATER +HORNBILL +HUMMINGBIRD +JACAMAR +TOUCAN +DRAKE +MERGANSER +GOOSE +BLACK SWAN +TUSKER +ECHIDNA +PLATYPUS +WALLABY +KOALA +WOMBAT +JELLYFISH +SEA ANEMONE +BRAIN CORAL +FLATWORM +NEMATODE +CONCH +SNAIL +SLUG +SEA SLUG +CHITON +CHAMBERED NAUTILUS +DUNGENESS CRAB +ROCK CRAB +FIDDLER CRAB +KING CRAB +AMERICAN LOBSTER +SPINY LOBSTER +CRAYFISH +HERMIT CRAB +ISOPOD +WHITE STORK +BLACK STORK +SPOONBILL +FLAMINGO +LITTLE BLUE HERON +AMERICAN EGRET +BITTERN +CRANE +LIMPKIN +EUROPEAN GALLINULE +AMERICAN COOT +BUSTARD +RUDDY TURNSTONE +RED-BACKED SANDPIPER +REDSHANK +DOWITCHER +OYSTERCATCHER +PELICAN +KING PENGUIN +ALBATROSS +GREY WHALE +KILLER WHALE +DUGONG +SEA LION +CHIHUAHUA +JAPANESE SPANIEL +MALTESE DOG +PEKINESE +SHIH-TZU +BLENHEIM SPANIEL +PAPILLON +TOY TERRIER +RHODESIAN RIDGEBACK +AFGHAN HOUND +BASSET +BEAGLE +BLOODHOUND +BLUETICK +COONHOUND +WALKER HOUND +ENGLISH FOXHOUND +REDBONE +BORZOI +IRISH WOLFHOUND +ITALIAN GREYHOUND +WHIPPET +IBIZAN HOUND +NORWEGIAN ELKHOUND +OTTERHOUND +SALUKI +SCOTTISH DEERHOUND +WEIMARANER +STAFFORDSHIRE BULLTERRIER +STAFFORDSHIRE TERRIER +BEDLINGTON TERRIER +BORDER TERRIER +KERRY BLUE TERRIER +IRISH TERRIER +NORFOLK TERRIER +NORWICH TERRIER +YORKSHIRE TERRIER +WIRE-HAIRED FOX TERRIER +LAKELAND TERRIER +SEALYHAM TERRIER +AIREDALE +CAIRN +AUSTRALIAN TERRIER +DANDIE DINMONT +BOSTON BULL +MINIATURE SCHNAUZER +GIANT SCHNAUZER +STANDARD SCHNAUZER +SCOTCH TERRIER +TIBETAN TERRIER +SILKY TERRIER +WHEATEN TERRIER +WHITE TERRIER +LHASA +RETRIEVER +CURLY-COATED RETRIEVER +GOLDEN RETRIEVER +LABRADOR RETRIEVER +CHESAPEAKE BAY RETRIEVER +SHORT-HAIRED POINTER +VISLA +ENGLISH SETTER +IRISH SETTER +GORDON SETTER +BRITTANY SPANIEL +CLUMBER +ENGLISH SPRINGER +WELSH SPRINGER SPANIEL +COCKER SPANIEL +SUSSEX SPANIEL +IRISH WATERSPANIEL +KUVASZ +SCHIPPERKE +GROENENDAEL +MALINOIS +BRIARD +KELPIE +KOMONDOR +OLD ENGLISH SHEEPDOG +SHETLAND SHEEPDOG +COLLIE 
+BORDER COLLIE +BOUVIER DES FLANDRES +ROTTWEILER +GERMAN SHEPHERD +DOBERMAN +MINIATURE PINSCHER +GREATER SWISS MOUNTAIN DOG +BERNESE MOUNTAIN DOG +APPENZELLER +ENTLEBUCHER +BOXER +BULL MASTIFF +TIBETAN MASTIFF +FRENCH BULLDOG +GREAT DANE +SAINT BERNARD +ESKIMO DOG +MALAMUTE +SIBERIAN HUSKY +DALMATIAN +AFFENPINSCHER +BASENJI +PUG +LEONBERG +NEWFOUNDLAND +GREAT PYRENEES +SAMOYED +POMERANIAN +CHOW +KEESHOND +BRABANCON GRIFFON +PEMBROKE +CARDIGAN +TOY POODLE +MINIATURE POODLE +STANDARD POODLE +MEXICAN HAIRLESS +TIMBER WOLF +WHITE WOLF +RED WOLF +COYOTE +DINGO +DHOLE +AFRICAN HUNTING DOG +HYENA +RED FOX +KIT FOX +ARCTIC FOX +GREY FOX +TABBY +TIGER CAT +PERSIAN CAT +SIAMESE CAT +EGYPTIAN CAT +COUGAR +LYNX +LEOPARD +SNOW LEOPARD +JAGUAR +LION +TIGER +CHEETAH +BROWN BEAR +AMERICAN BLACK BEAR +ICE BEAR +SLOTH BEAR +MONGOOSE +MEERKAT +TIGER BEETLE +LADYBUG +GROUND BEETLE +LONG-HORNED BEETLE +LEAF BEETLE +DUNG BEETLE +RHINOCEROS BEETLE +WEEVIL +FLY +BEE +ANT +GRASSHOPPER +CRICKET +WALKING STICK +COCKROACH +MANTIS +CICADA +LEAFHOPPER +LACEWING +DRAGONFLY +DAMSELFLY +ADMIRAL +RINGLET +MONARCH +CABBAGE BUTTERFLY +SULPHUR BUTTERFLY +LYCAENID +STARFISH +SEA URCHIN +SEA CUCUMBER +WOOD RABBIT +HARE +ANGORA +HAMSTER +PORCUPINE +FOX SQUIRREL +MARMOT +BEAVER +GUINEA PIG +SORREL +ZEBRA +HOG +WILD BOAR +WARTHOG +HIPPOPOTAMUS +OX +WATER BUFFALO +BISON +RAM +BIGHORN +IBEX +HARTEBEEST +IMPALA +GAZELLE +ARABIAN CAMEL +LLAMA +WEASEL +MINK +POLECAT +BLACK-FOOTED FERRET +OTTER +SKUNK +BADGER +ARMADILLO +THREE-TOED SLOTH +ORANGUTAN +GORILLA +CHIMPANZEE +GIBBON +SIAMANG +GUENON +PATAS +BABOON +MACAQUE +LANGUR +COLOBUS +PROBOSCIS MONKEY +MARMOSET +CAPUCHIN +HOWLER MONKEY +TITI +SPIDER MONKEY +SQUIRREL MONKEY +MADAGASCAR CAT +INDRI +INDIAN ELEPHANT +AFRICAN ELEPHANT +LESSER PANDA +GIANT PANDA +BARRACOUTA +EEL +COHO +ROCK BEAUTY +ANEMONE FISH +STURGEON +GAR +LIONFISH +PUFFER +ABACUS +ABAYA +ACADEMIC GOWN +ACCORDION +ACOUSTIC GUITAR +AIRCRAFT CARRIER +AIRLINER +AIRSHIP +ALTAR +AMBULANCE +AMPHIBIAN +ANALOG CLOCK +APIARY +APRON +ASHCAN +ASSAULT RIFLE +BACKPACK +BAKERY +BALANCE BEAM +BALLOON +BALLPOINT +BAND AID +BANJO +BANNISTER +BARBELL +BARBER CHAIR +BARBERSHOP +BARN +BAROMETER +BARREL +BARROW +BASEBALL +BASKETBALL +BASSINET +BASSOON +BATHING CAP +BATH TOWEL +BATHTUB +BEACH WAGON +BEACON +BEAKER +BEARSKIN +BEER BOTTLE +BEER GLASS +BELL COTE +BIB +BICYCLE-BUILT-FOR-TWO +BIKINI +BINDER +BINOCULARS +BIRDHOUSE +BOATHOUSE +BOBSLED +BOLO TIE +BONNET +BOOKCASE +BOOKSHOP +BOTTLECAP +BOW +BOW TIE +BRASS +BRASSIERE +BREAKWATER +BREASTPLATE +BROOM +BUCKET +BUCKLE +BULLETPROOF VEST +BULLET TRAIN +BUTCHER SHOP +CAB +CALDRON +CANDLE +CANNON +CANOE +CAN OPENER +CARDIGAN +CAR MIRROR +CAROUSEL +CARPENTERS KIT +CARTON +CAR WHEEL +CASH MACHINE +CASSETTE +CASSETTE PLAYER +CASTLE +CATAMARAN +CD PLAYER +CELLO +CELLULAR TELEPHONE +CHAIN +CHAINLINK FENCE +CHAIN MAIL +CHAIN SAW +CHEST +CHIFFONIER +CHIME +CHINA CABINET +CHRISTMAS STOCKING +CHURCH +CINEMA +CLEAVER +CLIFF DWELLING +CLOAK +CLOG +COCKTAIL SHAKER +COFFEE MUG +COFFEEPOT +COIL +COMBINATION LOCK +COMPUTER KEYBOARD +CONFECTIONERY +CONTAINER SHIP +CONVERTIBLE +CORKSCREW +CORNET +COWBOY BOOT +COWBOY HAT +CRADLE +CRANE +CRASH HELMET +CREATE +CRIB +CROCK POT +CROQUET BALL +CRUTCH +CUIRASS +DAM +DESK +DESKTOP COMPUTER +DIAL TELEPHONE +DIAPER +DIGITAL CLOCK +DIGITAL WATCH +DINING TABLE +DISHRAG +DISHWASHER +DISK BRAKE +DOCK +DOGSLED +DOME +DOORMAT +DRILLING PLATFORM +DRUM +DRUMSTICK +DUMBBELL +DUTCH OVEN +ELECTRIC FAN +ELECTRIC GUITAR +ELECTRIC LOCOMOTIVE +ENTERTAINMENT CENTER +ENVELOPE +ESPRESSO 
MAKER +FACE POWDER +FEATHER BOA +FILE +FIREBOAT +FIRE ENGINE +FIRE SCREEN +FLAGPOLE +FLUTE +FOLDING CHAIR +FOOTBALL HELMET +FORKLIFT +FOUNTAIN +FOUNTAIN PEN +FOUR-POSTER +FREIGHT CAR +FRENCH HORN +FRYING PAN +FUR COAT +GARBAGE TRUCK +GASMASK +GAS PUMP +GOBLET +GO-KART +GOLF BALL +GOLFCART +GONDOLA +GONG +GOWN +GRAND PIANO +GREENHOUSE +GRILLE +GROCERY STORE +GUILLOTINE +HAIR SLIDE +HAIR SPRAY +HALF TRACK +HAMMER +HAMPER +HAND BLOWER +HAND-HELD COMPUTER +HANDKERCHIEF +HARD DISC +HARMONICA +HARP +HARVESTER +HATCHET +HOLSTER +HOME THEATER +HONEYCOMB +HOOK +HOOPSKIRT +HORIZONTAL BAR +HORSE CART +HOURGLASS +IPOD +IRON +JACK-O-LANTERN +JEAN +JEEP +JERSEY +JIGSAW PUZZLE +JINRIKISHA +JOYSTICK +KIMONO +KNEE PAD +KNOT +LAB COAT +LADLE +LAMPSHADE +LAPTOP +LAWN MOWER +LENS CAP +LETTER OPENER +LIBRARY +LIFEBOAT +LIGHTER +LIMOUSINE +LINER +LIPSTICK +LOAFER +LOTION +LOUDSPEAKER +LOUPE +LUMBERMILL +MAGNETIC COMPASS +MAILBAG +MAILBOX +MAILLOT +MAILLOT +MANHOLE COVER +MARACA +MARIMBA +MASK +MATCHSTICK +MAYPOLE +MAZE +MEASURING CUP +MEDICINE CHEST +MEGALITH +MICROPHONE +MICROWAVE +MILITARY UNIFORM +MILK CAN +MINIBUS +MINISKIRT +MINIVAN +MISSILE +MITTEN +MIXING BOWL +MOBILE HOME +MODEL T +MODEM +MONASTERY +MONITOR +MOPED +MORTAR +MORTARBOARD +MOSQUE +MOSQUITO NET +MOTOR SCOOTER +MOUNTAIN BIKE +MOUNTAIN TENT +MOUSE +MOUSETRAP +MOVING VAN +MUZZLE +NAIL +NECK BRACE +NECKLACE +NIPPLE +NOTEBOOK +OBELISK +OBOE +OCARINA +ODOMETER +OIL FILTER +ORGAN +OSCILLOSCOPE +OVERSKIRT +OXCART +OXYGEN MASK +PACKET +PADDLE +PADDLEWHEEL +PADLOCK +PAINTBRUSH +PAJAMA +PALACE +PANPIPE +PAPER TOWEL +PARACHUTE +PARALLEL BARS +PARK BENCH +PARKING METER +PASSENGER CAR +PATIO +PAY-PHONE +PEDESTAL +PENCIL BOX +PENCIL SHARPENER +PERFUME +PETRI DISH +PHOTOCOPIER +PICK +PICKELHAUBE +PICKET FENCE +PICKUP +PIER +PIGGY BANK +PILL BOTTLE +PILLOW +PING-PONG BALL +PINWHEEL +PIRATE +PITCHER +PLANE +PLANETARIUM +PLASTIC BAG +PLATE RACK +PLOW +PLUNGER +POLAROID CAMERA +POLE +POLICE VAN +PONCHO +POOL TABLE +POP BOTTLE +POT +POTTERS WHEEL +POWER DRILL +PRAYER RUG +PRINTER +PRISON +PROJECTILE +PROJECTOR +PUCK +PUNCHING BAG +PURSE +QUILL +QUILT +RACER +RACKET +RADIATOR +RADIO +RADIO TELESCOPE +RAIN BARREL +RECREATIONAL VEHICLE +REEL +REFLEX CAMERA +REFRIGERATOR +REMOTE CONTROL +RESTAURANT +REVOLVER +RIFLE +ROCKING CHAIR +ROTISSERIE +RUBBER ERASER +RUGBY BALL +RULE +RUNNING SHOE +SAFE +SAFETY PIN +SALTSHAKER +SANDAL +SARONG +SAX +SCABBARD +SCALE +SCHOOL BUS +SCHOONER +SCOREBOARD +SCREEN +SCREW +SCREWDRIVER +SEAT BELT +SEWING MACHINE +SHIELD +SHOE SHOP +SHOJI +SHOPPING BASKET +SHOPPING CART +SHOVEL +SHOWER CAP +SHOWER CURTAIN +SKI +SKI MASK +SLEEPING BAG +SLIDE RULE +SLIDING DOOR +SLOT +SNORKEL +SNOWMOBILE +SNOWPLOW +SOAP DISPENSER +SOCCER BALL +SOCK +SOLAR DISH +SOMBRERO +SOUP BOWL +SPACE BAR +SPACE HEATER +SPACE SHUTTLE +SPATULA +SPEEDBOAT +SPIDER WEB +SPINDLE +SPORTS CAR +SPOTLIGHT +STAGE +STEAM LOCOMOTIVE +STEEL ARCH BRIDGE +STEEL DRUM +STETHOSCOPE +STOLE +STONE WALL +STOPWATCH +STOVE +STRAINER +STREETCAR +STRETCHER +STUDIO COUCH +STUPA +SUBMARINE +SUIT +SUNDIAL +SUNGLASS +SUNGLASSES +SUNSCREEN +SUSPENSION BRIDGE +SWAB +SWEATSHIRT +SWIMMING TRUNKS +SWING +SWITCH +SYRINGE +TABLE LAMP +TANK +TAPE PLAYER +TEAPOT +TEDDY +TELEVISION +TENNIS BALL +THATCH +THEATER CURTAIN +THIMBLE +THRESHER +THRONE +TILE ROOF +TOASTER +TOBACCO SHOP +TOILET SEAT +TORCH +TOTEM POLE +TOW TRUCK +TOYSHOP +TRACTOR +TRAILER TRUCK +TRAY +TRENCH COAT +TRICYCLE +TRIMARAN +TRIPOD +TRIUMPHAL ARCH +TROLLEYBUS +TROMBONE +TUB +TURNSTILE +TYPEWRITER KEYBOARD +UMBRELLA +UNICYCLE +UPRIGHT 
+VACUUM +VASE +VAULT +VELVET +VENDING MACHINE +VESTMENT +VIADUCT +VIOLIN +VOLLEYBALL +WAFFLE IRON +WALL CLOCK +WALLET +WARDROBE +WARPLANE +WASHBASIN +WASHER +WATER BOTTLE +WATER JUG +WATER TOWER +WHISKEY JUG +WHISTLE +WIG +WINDOW SCREEN +WINDOW SHADE +WINDSOR TIE +WINE BOTTLE +WING +WOK +WOODEN SPOON +WOOL +WORM FENCE +WRECK +YAWL +YURT +WEB SITE +COMIC BOOK +CROSSWORD PUZZLE +STREET SIGN +TRAFFIC LIGHT +BOOK JACKET +MENU +PLATE +GUACAMOLE +CONSOMME +HOT POT +TRIFLE +ICE CREAM +ICE LOLLY +FRENCH LOAF +BAGEL +PRETZEL +CHEESEBURGER +HOTDOG +MASHED POTATO +HEAD CABBAGE +BROCCOLI +CAULIFLOWER +ZUCCHINI +SPAGHETTI SQUASH +ACORN SQUASH +BUTTERNUT SQUASH +CUCUMBER +ARTICHOKE +BELL PEPPER +CARDOON +MUSHROOM +GRANNY SMITH +STRAWBERRY +ORANGE +LEMON +FIG +PINEAPPLE +BANANA +JACKFRUIT +CUSTARD APPLE +POMEGRANATE +HAY +CARBONARA +CHOCOLATE SAUCE +DOUGH +MEAT LOAF +PIZZA +POTPIE +BURRITO +RED WINE +ESPRESSO +CUP +EGGNOG +ALP +BUBBLE +CLIFF +CORAL REEF +GEYSER +LAKESIDE +PROMONTORY +SANDBAR +SEASHORE +VALLEY +VOLCANO +BALLPLAYER +GROOM +SCUBA DIVER +RAPESEED +DAISY +LADY SLIPPER +CORN +ACORN +HIP +BUCKEYE +CORAL FUNGUS +AGARIC +GYROMITRA +STINKHORN +EARTHSTAR +HEN-OF-THE-WOODS +BOLETE +EAR +TOILET TISSUE diff --git a/examples/preprocessing/onnx_exporter.py b/examples/preprocessing/onnx_exporter.py new file mode 100644 index 00000000..3be47b57 --- /dev/null +++ b/examples/preprocessing/onnx_exporter.py @@ -0,0 +1,57 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +import argparse +import os + +import torch +import torchvision.models as models + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("--save", default="model.onnx") + args = parser.parse_args() + + resnet50 = models.resnet50(pretrained=True) + dummy_input = torch.randn(1, 3, 224, 224) + resnet50 = resnet50.eval() + + torch.onnx.export( + resnet50, + dummy_input, + args.save, + export_params=True, + opset_version=10, + do_constant_folding=True, + input_names=["input"], + output_names=["output"], + dynamic_axes={ + "input": {0: "batch_size", 2: "height", 3: "width"}, + "output": {0: "batch_size"}, + }, + ) + + print("Saved {}".format(args.save)) diff --git a/examples/pytorch/client.py b/examples/pytorch/client.py index 43115ff1..af1abd39 100644 --- a/examples/pytorch/client.py +++ b/examples/pytorch/client.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +# Copyright 2020-2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions @@ -24,11 +24,11 @@ # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -from tritonclient.utils import * -import tritonclient.grpc as grpcclient -import tritonclient.http as httpclient +import sys import numpy as np +import tritonclient.http as httpclient +from tritonclient.utils import * model_name = "pytorch" shape = [4] @@ -37,10 +37,12 @@ input0_data = np.random.rand(*shape).astype(np.float32) input1_data = np.random.rand(*shape).astype(np.float32) inputs = [ - httpclient.InferInput("INPUT0", input0_data.shape, - np_to_triton_dtype(input0_data.dtype)), - httpclient.InferInput("INPUT1", input1_data.shape, - np_to_triton_dtype(input1_data.dtype)), + httpclient.InferInput( + "INPUT0", input0_data.shape, np_to_triton_dtype(input0_data.dtype) + ), + httpclient.InferInput( + "INPUT1", input1_data.shape, np_to_triton_dtype(input1_data.dtype) + ), ] inputs[0].set_data_from_numpy(input0_data) @@ -51,13 +53,30 @@ httpclient.InferRequestedOutput("OUTPUT1"), ] - response = client.infer(model_name, - inputs, - request_id=str(1), - outputs=outputs) + response = client.infer(model_name, inputs, request_id=str(1), outputs=outputs) result = response.get_response() - print("INPUT0 ({}) + INPUT1 ({}) = OUTPUT0 ({})".format( - input0_data, input1_data, response.as_numpy("OUTPUT0"))) - print("INPUT0 ({}) - INPUT1 ({}) = OUTPUT0 ({})".format( - input0_data, input1_data, response.as_numpy("OUTPUT1"))) + output0_data = response.as_numpy("OUTPUT0") + output1_data = response.as_numpy("OUTPUT1") + + print( + "INPUT0 ({}) + INPUT1 ({}) = OUTPUT0 ({})".format( + input0_data, input1_data, output0_data + ) + ) + print( + "INPUT0 ({}) - INPUT1 ({}) = OUTPUT0 ({})".format( + input0_data, input1_data, output1_data + ) + ) + + if not np.allclose(input0_data + input1_data, output0_data): + print("pytorch example error: incorrect sum") + sys.exit(1) + + if not np.allclose(input0_data - input1_data, output1_data): + print("pytorch example error: incorrect difference") + sys.exit(1) + + print("PASS: pytorch") + sys.exit(0) diff --git a/examples/pytorch/config.pbtxt b/examples/pytorch/config.pbtxt index 19feabd2..6ac109bf 100644 --- a/examples/pytorch/config.pbtxt +++ b/examples/pytorch/config.pbtxt @@ -32,7 +32,6 @@ input [ name: "INPUT0" data_type: TYPE_FP32 dims: [ 4 ] - } ] input [ @@ -40,7 +39,6 
@@ input [ name: "INPUT1" data_type: TYPE_FP32 dims: [ 4 ] - } ] output [ diff --git a/examples/pytorch/model.py b/examples/pytorch/model.py index 645629b7..89b0c8a2 100644 --- a/examples/pytorch/model.py +++ b/examples/pytorch/model.py @@ -24,16 +24,14 @@ # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -import numpy as np -import sys import json -from torch import nn # triton_python_backend_utils is available in every Triton Python model. You # need to use this module to create inference requests and responses. It also # contains some utility functions for extracting information from model_config # and converting Triton input/output types to numpy types. import triton_python_backend_utils as pb_utils +from torch import nn class AddSubNet(nn.Module): @@ -46,8 +44,6 @@ def __init__(self): super(AddSubNet, self).__init__() def forward(self, input0, input1): - """ - """ return (input0 + input1), (input0 - input1) @@ -59,7 +55,7 @@ class TritonPythonModel: def initialize(self, args): """`initialize` is called only once when the model is being loaded. Implementing `initialize` function is optional. This function allows - the model to intialize any state associated with this model. + the model to initialize any state associated with this model. Parameters ---------- @@ -74,21 +70,21 @@ def initialize(self, args): """ # You must parse model_config. JSON string is not parsed here - self.model_config = model_config = json.loads(args['model_config']) + self.model_config = model_config = json.loads(args["model_config"]) # Get OUTPUT0 configuration - output0_config = pb_utils.get_output_config_by_name( - model_config, "OUTPUT0") + output0_config = pb_utils.get_output_config_by_name(model_config, "OUTPUT0") # Get OUTPUT1 configuration - output1_config = pb_utils.get_output_config_by_name( - model_config, "OUTPUT1") + output1_config = pb_utils.get_output_config_by_name(model_config, "OUTPUT1") # Convert Triton types to numpy types self.output0_dtype = pb_utils.triton_string_to_numpy( - output0_config['data_type']) + output0_config["data_type"] + ) self.output1_dtype = pb_utils.triton_string_to_numpy( - output1_config['data_type']) + output1_config["data_type"] + ) # Instantiate the PyTorch model self.add_sub_model = AddSubNet() @@ -132,10 +128,8 @@ def execute(self, requests): # Create output tensors. You need pb_utils.Tensor # objects to create pb_utils.InferenceResponse. - out_tensor_0 = pb_utils.Tensor("OUTPUT0", - out_0.astype(output0_dtype)) - out_tensor_1 = pb_utils.Tensor("OUTPUT1", - out_1.astype(output1_dtype)) + out_tensor_0 = pb_utils.Tensor("OUTPUT0", out_0.astype(output0_dtype)) + out_tensor_1 = pb_utils.Tensor("OUTPUT1", out_1.astype(output1_dtype)) # Create InferenceResponse. You can set an error here in case # there was a problem with handling this inference request. @@ -143,9 +137,10 @@ def execute(self, requests): # response: # # pb_utils.InferenceResponse( - # output_tensors=..., TritonError("An error occured")) + # output_tensors=..., TritonError("An error occurred")) inference_response = pb_utils.InferenceResponse( - output_tensors=[out_tensor_0, out_tensor_1]) + output_tensors=[out_tensor_0, out_tensor_1] + ) responses.append(inference_response) # You should return a list of pb_utils.InferenceResponse. Length @@ -157,4 +152,4 @@ def finalize(self): Implementing `finalize` function is optional. This function allows the model to perform any necessary clean ups before exit. 
""" - print('Cleaning up...') + print("Cleaning up...") diff --git a/inferentia/README.md b/inferentia/README.md new file mode 100644 index 00000000..fb0de4f7 --- /dev/null +++ b/inferentia/README.md @@ -0,0 +1,350 @@ + + +# Using Triton with Inferentia 1 + +Starting from 21.11 release, Triton supports +[AWS Inferentia](https://aws.amazon.com/machine-learning/inferentia/) +and the [Neuron Runtime](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/neuron-intro/get-started.html). + +## Table of Contents + +- [Using Triton with Inferentia 1](#using-triton-with-inferentia-1) + - [Table of Contents](#table-of-contents) + - [Inferentia setup](#inferentia-setup) + - [Setting up the Inferentia model](#setting-up-the-inferentia-model) + - [PyTorch](#pytorch) + - [TensorFlow](#tensorflow) + - [Serving Inferentia model in Triton](#serving-inferentia-model-in-triton) + - [Using Triton's Dynamic Batching](#using-tritons-dynamic-batching) + - [Testing Inferentia Setup for Accuracy](#testing-inferentia-setup-for-accuracy) + +## Inferentia setup + +First step of running Triton with Inferentia is to create an AWS Inferentia + instance with Deep Learning AMI (tested with Ubuntu 18.04). +`ssh -i .pem ubuntu@` +Note: It is recommended to set your storage space to greater than default value +of 110 GiB. The current version of Triton has been tested +with storage of 500 GiB. + +After logging into the inf1* instance, you will need to clone +[this current Github repo](https://github.com/triton-inference-server/python_backend). + Follow [steps on Github to set up ssh access](https://docs.github.com/en/authentication/connecting-to-github-with-ssh) +or simply clone with https. +Clone this repo with Github to home repo `/home/ubuntu`. + +``` + chmod 777 /home/ubuntu/python_backend/inferentia/scripts/setup-pre-container.sh + sudo /home/ubuntu/python_backend/inferentia/scripts/setup-pre-container.sh +``` + +Then, start the Triton instance with: +``` + docker run --device /dev/neuron0 -v /home/ubuntu/python_backend:/home/ubuntu/python_backend -v /lib/udev:/mylib/udev --shm-size=1g --ulimit memlock=-1 -p 8000:8000 -p 8001:8001 -p 8002:8002 --ulimit stack=67108864 -ti nvcr.io/nvidia/tritonserver:-py3 +``` +Note 1: The user would need to list any neuron device to run during container initialization. +For example, to use 4 neuron devices on an instance, the user would need to run with: +``` + docker run --device /dev/neuron0 --device /dev/neuron1 --device /dev/neuron2 --device /dev/neuron3 ...` +``` +Note 2: `/mylib/udev` is used for Neuron parameter passing. + +Note 3: For Triton container version xx.yy, please refer to +[Triton Inference Server Container Release Notes](https://docs.nvidia.com/deeplearning/triton-inference-server/release-notes/index.html). + The current build script has been tested with container version `21.10`. + +After starting the Triton container, go into the `python_backend` folder and run the setup script. +``` + source /home/ubuntu/python_backend/inferentia/scripts/setup.sh +``` +This script will: +1. Install necessary dependencies +2. Install [neuron-cc](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/neuron-guide/neuron-cc/index.html), the Neuron compiler. +3. Install neuron framework packages as per your preference e.g., either pytorch, or tensorflow or both. + +There are user configurable options available for the script as well. +Please use the `-h` or `--help` options to learn about more configurable options. 
+
+## Setting up the Inferentia model
+
+Currently, we only support [PyTorch](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/neuron-guide/neuron-frameworks/pytorch-neuron/index.html)
+and [TensorFlow](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/neuron-guide/neuron-frameworks/tensorflow-neuron/index.html)
+workflows for execution on Inferentia.
+
+The user is required to create their own `*.pt` (for PyTorch) or `*.savedmodels`
+(for TensorFlow) models. This is a critical step since Inferentia will need
+the underlying `.NEFF` graph to execute the inference request. Please refer to:
+
+- [Neuron compiler CLI Reference Guide](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/neuron-guide/neuron-cc/command-line-reference.html)
+- [PyTorch-Neuron trace python API](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/neuron-guide/neuron-frameworks/pytorch-neuron/api-compilation-python-api.html)
+- [PyTorch Tutorials](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/neuron-guide/neuron-frameworks/pytorch-neuron/tutorials/index.html)
+- [TensorFlow Tutorials](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/neuron-guide/neuron-frameworks/tensorflow-neuron/tutorials/index.html)
+for guidance on how to compile models.
+
+### PyTorch
+
+For PyTorch, we support models traced by the [PyTorch-Neuron trace python API](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/neuron-guide/neuron-frameworks/pytorch-neuron/api-compilation-python-api.html)
+for execution on Inferentia.
+Once the TorchScript model supporting Inferentia is obtained, use the
+[gen_triton_model.py](scripts/gen_triton_model.py) script to generate
+the triton python model directory.
+
+An example invocation of `gen_triton_model.py` for a PyTorch model can look like:
+
+```
+ python3 inferentia/scripts/gen_triton_model.py --model_type pytorch --triton_input INPUT__0,INT64,4x384 INPUT__1,INT64,4x384 INPUT__2,INT64,4x384 --triton_output OUTPUT__0,INT64,4x384 OUTPUT__1,INT64,4x384 --compiled_model /home/ubuntu/bert_large_mlperf_neuron_hack_bs1_dynamic.pt --neuron_core_range 0:3 --triton_model_dir bert-large-mlperf-bs1x4
+```
+
+In order for the script to treat the compiled model as a TorchScript
+model, `--model_type pytorch` needs to be provided.
+
+NOTE: Because a TorchScript model carries no metadata about its inputs and
+outputs, the name, datatype and tensor shape of every input and output must
+be provided to the above script, and the names must follow a specific naming
+convention, i.e. `<name>__<index>`, where `<name>` can be any string and
+`<index>` refers to the position of the corresponding input/output. This
+means that if there are two inputs and two outputs, they must be named
+"INPUT__0", "INPUT__1" and "OUTPUT__0", "OUTPUT__1", such that "INPUT__0"
+refers to the first input and "INPUT__1" refers to the second input, etc.
+
+Additionally, `--neuron_core_range` specifies the neuron cores to
+be used while serving this model. Currently, only
+`torch.neuron.DataParallel()` mode is supported. See
+[Data Parallel Inference](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/neuron-guide/appnotes/perf/torch-neuron-dataparallel-app-note.html)
+for more information. The Triton model instance count can be specified
+using the `--triton_model_instance_count` option. The neuron
+cores will be equally distributed among all instances. For example,
+in the case of two triton model instances and 4 neuron cores, the first
+instance will be loaded on cores 0-1 and the second instance will be
+loaded on cores 2-3.
To make the best use of the Inferentia device, try setting
+the number of neuron cores to be a proper multiple of the instance
+count.
+
+### TensorFlow
+
+For TensorFlow, the model must be compiled for AWS Neuron. See the
+[AWS Neuron TensorFlow](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/neuron-guide/neuron-frameworks/tensorflow-neuron/tutorials/index.html)
+tutorials to learn how to get a compiled model that uses Neuron
+cores. Currently, the code is tested only on `tensorflow==1.15`.
+
+Once the compiled model is obtained, use the [gen_triton_model.py](scripts/gen_triton_model.py)
+script to generate the triton python model directory.
+
+An example invocation of `gen_triton_model.py` for a TensorFlow model can look like:
+
+```
+ python3 gen_triton_model.py --model_type tensorflow --compiled_model /home/ubuntu/inferentia-poc-2.0/scripts-rn50-tf-native/resnet50_mlperf_opt_fp16_compiled_b5_nc1/1 --neuron_core_range 0:3 --triton_model_dir rn50-1neuroncores-bs1x1
+```
+
+NOTE: Unlike a TorchScript model, a TensorFlow SavedModel stores sufficient
+metadata to detect the name, datatype and shape of the input and output
+tensors for the model. By default, the script will assume the compiled
+model to be TorchScript. In order for it to treat the compiled model
+as a TF SavedModel, `--model_type tensorflow` needs to be provided.
+The input and output details are read from the model itself. The user
+must have the [`tensorflow`](https://www.tensorflow.org/install/pip) Python
+module installed in order to use this script for TensorFlow models.
+
+Similar to PyTorch, `--neuron_core_range` and `--triton_model_instance_count`
+can be used to specify the neuron core range and the number of triton model
+instances. However, the neuron core indices don't point to a specific
+neuron core in the chip. For TensorFlow, we use the deprecated
+`NEURONCORE_GROUP_SIZES` feature to load the model. The model in this case will
+be loaded on the next available Neuron cores, not on specific ones. See
+[Parallel Execution using NEURONCORE_GROUP_SIZES](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/neuron-guide/appnotes/perf/parallel-ncgs.html?highlight=NEURONCORE_GROUP_SIZES)
+for more information.
+
+Also note that, since Neuron-TensorFlow (unlike Neuron-PyTorch) does not have
+built-in functions for running a model on multiple cores, `model.py` will
+distribute the workload by splitting the input tensor across the available cores.
+It is recommended that the first dimension of the inputs be `None` if the user
+enables processing across multiple cores.
+
+Please use the `-h` or `--help` options in `gen_triton_model.py` to
+learn about more configurable options.
+
+## Serving Inferentia model in Triton
+
+`gen_triton_model.py` should create a triton model directory with the following
+structure:
+
+```
+bert-large-mlperf-bs1x4
+ |
+ |- 1
+ |  |- model.py
+ |
+ |- config.pbtxt
+```
+
+Look at the usage message of the script to understand each option.
+
+The script will generate a model directory with the user-provided
+name. Move that model directory to Triton's model repository.
+Ensure the compiled model path provided to the script points to
+a valid TorchScript file or TensorFlow SavedModel.
+
+Now, the server can be launched with the model as below:
+
+```
+ tritonserver --model-repository
+```
+
+Note:
+
+1. The `config.pbtxt` and `model.py` should be treated as a
+starting point. The users can customize these files as per
+their needs.
+2. Triton Inferentia is currently tested with a **single** model.
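Once the server is running, the generated model can be queried with the standard Triton client libraries. The following is a minimal sketch (an illustration added for this guide, not a file in this PR), modeled on `examples/pytorch/client.py`. The model name, the `INPUT__*`/`OUTPUT__*` tensor names, and the `4x384` shapes are taken from the `gen_triton_model.py` invocation in the PyTorch section above; adjust them, as well as any leading batch dimension implied by your `max_batch_size`, to match your generated `config.pbtxt`.

```
import numpy as np
import tritonclient.http as httpclient

# Assumes Triton is reachable on the HTTP port published by the
# `docker run` command in the setup section above.
client = httpclient.InferenceServerClient("localhost:8000")

inputs = []
for name in ["INPUT__0", "INPUT__1", "INPUT__2"]:
    # Dummy token ids; the 4x384 shape mirrors the --triton_input dims
    # used when generating the bert-large-mlperf-bs1x4 example model.
    data = np.zeros((4, 384), dtype=np.int64)
    infer_input = httpclient.InferInput(name, data.shape, "INT64")
    infer_input.set_data_from_numpy(data)
    inputs.append(infer_input)

response = client.infer("bert-large-mlperf-bs1x4", inputs)
print(response.as_numpy("OUTPUT__0").shape)
```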
+
+### Using Triton's Dynamic Batching
+
+To enable dynamic batching, the `--enable_dynamic_batching`
+flag needs to be specified. `gen_triton_model.py` supports the following three
+options for configuring [Triton's dynamic batching](https://github.com/triton-inference-server/server/blob/main/docs/user_guide/model_configuration.md):
+
+1. `--preferred_batch_size`: Please refer to the [model configuration documentation](https://github.com/triton-inference-server/server/blob/main/docs/user_guide/model_configuration.md#preferred-batch-sizes) for details on preferred batch sizes. To optimize
+   performance, this is recommended to be a multiple of the number of engaged neuron cores.
+   For example, if each instance is using 2 neuron cores, `preferred_batch_size`
+   could be 2, 4 or 6.
+2. `--max_queue_delay_microseconds`: Please refer to the
+   [model configuration documentation](https://github.com/triton-inference-server/server/blob/main/docs/user_guide/model_configuration.md#delayed-batching) for details.
+3. `--disable_batch_requests_to_neuron`: Enables the non-default way for Triton to
+   handle batched requests: the Triton backend will send each request to neuron
+   separately, irrespective of whether the Triton server requests are batched.
+   This flag is recommended when users want to optimize performance with models
+   that do not perform well with batching without the flag.
+
+Additionally, `--max_batch_size` will affect the maximum batching limit. Please
+refer to the
+[model configuration documentation](https://github.com/triton-inference-server/server/blob/main/docs/user_guide/model_configuration.md#maximum-batch-size)
+for details.
+
+## Testing Inferentia Setup for Accuracy
+
+The [qa folder](https://github.com/triton-inference-server/python_backend/tree/main/inferentia/qa)
+contains the necessary files to set up testing with a simple add_sub model. The test
+requires an instance with more than 8 Inferentia cores to run, e.g. `inf1.6xlarge`.
+To start the test, run
+```
+ source <triton path>/python_backend/inferentia/qa/setup_test_enviroment_and_test.sh
+```
+where `<triton path>` is usually `/home/ubuntu`.
+This script will pull the [server repo](https://github.com/triton-inference-server/server)
+that contains the tests for Inferentia. It will then build the most recent
+Triton Server and Triton SDK.
+
+Note: If you need to change some of the tests in the server repo,
+you will need to run
+```
+ export TRITON_SERVER_REPO_TAG=
+```
+before running the script.
+
+# Using Triton with Inferentia 2, or Trn1
+## pytorch-neuronx and tensorflow-neuronx
+1. Similar to the steps for inf1, change the arguments to the pre-container and on-container setup scripts to include the `-inf2` or `-trn1` flags, e.g.,
+```
+ chmod 777 /home/ubuntu/python_backend/inferentia/scripts/setup-pre-container.sh
+ sudo /home/ubuntu/python_backend/inferentia/scripts/setup-pre-container.sh -inf2
+```
+2. On the container, after the `docker run` command, you can pass a similar argument to the setup.sh script.
+For PyTorch:
+```
+source /home/ubuntu/python_backend/inferentia/scripts/setup.sh -inf2 -p
+```
+For TensorFlow:
+```
+source /home/ubuntu/python_backend/inferentia/scripts/setup.sh -inf2 -t
+```
+3. 
Following the above steps, when using the `gen_triton_model.py` script, you can pass a similar `--inf2` argument to it, e.g., for PyTorch:
+```
+python3 inferentia/scripts/gen_triton_model.py --inf2 --model_type pytorch --triton_input INPUT__0,INT64,4x384 INPUT__1,INT64,4x384 INPUT__2,INT64,4x384 --triton_output OUTPUT__0,INT64,4x384 OUTPUT__1,INT64,4x384 --compiled_model bert_large_mlperf_neuron_hack_bs1_dynamic.pt --neuron_core_range 0:3 --triton_model_dir bert-large-mlperf-bs1x4
+```
+4. **Note**: When using the `--inf2` option, the `--compiled_model` path should be provided relative to the triton model directory. The `initialize()` function in model.py will derive the full path by concatenating the model path within the repository and the relative `--compiled_model` path.
+## transformers-neuronx
+To use inf2/trn1 instances with the transformers-neuronx packages for serving models, generate a `pytorch` model as per the above instructions. transformers-neuronx currently supports the models listed [here](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/frameworks/torch/torch-neuronx/transformers-neuronx/readme.html#currently-supported-models).
+
+As prescribed on the neuronx documentation page, while the neuronx load API differs per model, it follows the same pattern.
+
+1. To serve transformers-neuronx models, first trace the model using the `save_pretrained_split()` API on an inf2 instance (inf2.24xl is recommended for Large Language Models). Following that, package the folder and pass it as `--compiled_model` when using the `gen_triton_model.py` script.
+2. The following tree shows a sample model structure for the OPT model:
+```
+opt/
+├── 1
+│   └── model.py
+├── config.pbtxt
+├── opt-125m-model
+│   └── pytorch_model.bin
+└── opt-125m-tp12
+    ├── FullyUnrolled.1814.1
+    │   ├── penguin-sg0000
+    │   └── sg00
+    ├── FullyUnrolled.1814.2
+    │   ├── penguin-sg0000
+    │   └── sg00
+    ├── FullyUnrolled.1814.3
+    │   ├── penguin-sg0000
+    │   └── sg00
+    ├── FullyUnrolled.1814.4
+    │   ├── penguin-sg0000
+    │   └── sg00
+    └── FullyUnrolled.1814.5
+        ├── penguin-sg0000
+        └── sg00
+```
+
+3. Add the following import (e.g., for the OPT model). The import will differ depending on the model you're trying to run.
+```
+from transformers_neuronx.opt.model import OPTForSampling
+```
+
+4. Add the following lines in the `initialize()` function. Set the `batch_size`, `tp_degree`, `n_positions`, `amp` and `unroll` args as per your requirements. `tp_degree` should typically match the number of neuron cores available on the inf2 instance.
+```
+batch_size = 1
+tp_degree = 12
+n_positions = 2048
+amp = 'bf16'
+unroll = None
+self.model_neuron = OPTForSampling.from_pretrained(compiled_model, batch_size=batch_size, amp=amp, tp_degree=tp_degree, n_positions=n_positions, unroll=unroll)
+self.model_neuron.to_neuron()
+
+self.model_neuron.num_workers = num_threads
+```
+You may also choose to add the `batch_size` etc. arguments to config.pbtxt as parameters and read them in the `initialize()` function, similar to `--compiled_model`.
+
+5. Finally, in the `execute()` function, use the following API to run the inference:
+```
+batched_results = self.model_neuron.sample(batched_tensor, 2048)
+```
+Above, `2048` is a sufficiently long output token length. It may also be passed in as one of the inputs if you want to specify it as part of the payload.
+
+6. Proceed to load the model and submit the inference payload similar to any other triton model.
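For reference, the pieces from steps 3-5 fit together in the generated `model.py` roughly as sketched below. This is an illustrative outline only (not part of this PR): the `INPUT__0`/`OUTPUT__0` tensor names follow the naming convention described earlier, the compiled model folder name is taken from the sample tree in step 2, and in the real generated `model.py` values such as the compiled model path and the thread count come from `config.pbtxt` parameters (e.g. `COMPILED_MODEL`).

```
import os

import numpy as np
import torch
import triton_python_backend_utils as pb_utils
from transformers_neuronx.opt.model import OPTForSampling


class TritonPythonModel:
    def initialize(self, args):
        # With --inf2, the compiled model path is resolved relative to the
        # model repository; "opt-125m-model" is the folder from step 2.
        compiled_model = os.path.join(args["model_repository"], "opt-125m-model")
        self.model_neuron = OPTForSampling.from_pretrained(
            compiled_model,
            batch_size=1,
            amp="bf16",
            tp_degree=12,
            n_positions=2048,
            unroll=None,
        )
        self.model_neuron.to_neuron()
        self.model_neuron.num_workers = 1

    def execute(self, requests):
        responses = []
        for request in requests:
            # Assumes a single INT64 token-id input named "INPUT__0".
            tokens = torch.as_tensor(
                pb_utils.get_input_tensor_by_name(request, "INPUT__0").as_numpy()
            )
            # 2048 is the output token length from step 5; sample() is assumed
            # to return token ids convertible to a numpy array.
            sampled = self.model_neuron.sample(tokens, 2048)
            out = pb_utils.Tensor("OUTPUT__0", np.asarray(sampled))
            responses.append(pb_utils.InferenceResponse(output_tensors=[out]))
        return responses
```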
\ No newline at end of file diff --git a/inferentia/qa/Dockerfile.QA b/inferentia/qa/Dockerfile.QA new file mode 100644 index 00000000..21f157e1 --- /dev/null +++ b/inferentia/qa/Dockerfile.QA @@ -0,0 +1,82 @@ +# Copyright 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +# +# Multistage build. +# +ARG BASE_IMAGE=tritonserver +ARG BUILD_IMAGE=tritonserver_build +ARG SDK_IMAGE=tritonserver_sdk +ARG TRITON_PATH=/home/ubuntu + +FROM ${SDK_IMAGE} AS sdk +FROM $BASE_IMAGE +# Ensure apt-get won't prompt for selecting options +ENV DEBIAN_FRONTEND=noninteractive +# install platform specific packages +RUN if [ $(cat /etc/os-release | grep 'VERSION_ID="20.04"' | wc -l) -ne 0 ]; then \ + apt-get update && \ + apt-get install -y --no-install-recommends \ + libpng-dev; \ + elif [ $(cat /etc/os-release | grep 'VERSION_ID="18.04"' | wc -l) -ne 0 ]; then \ + apt-get update && \ + apt-get install -y --no-install-recommends \ + libpng-dev; \ + else \ + echo "Ubuntu version must be either 18.04 or 20.04" && \ + exit 1; \ + fi + +RUN apt-get update && apt-get install -y --no-install-recommends \ + python3-dev \ + python3-pip \ + build-essential \ + wget && \ + rm -rf /var/lib/apt/lists/* + +RUN rm -f /usr/bin/python && \ + ln -s /usr/bin/python3 /usr/bin/python + +RUN pip3 install --upgrade wheel setuptools && \ + pip3 install --upgrade numpy pillow attrdict future grpcio requests gsutil awscli six grpcio-channelz + +WORKDIR /opt/tritonserver +# Copy the entire qa repo to the /opt/tritonserver/qa repo +COPY --from=tritonserver_build /workspace/qa qa +COPY --chown=1000:1000 --from=sdk /workspace/install client_tmp +RUN mkdir -p qa/clients && mkdir -p qa/pkgs && \ + cp -a client_tmp/bin/* qa/clients/. && \ + cp client_tmp/lib/libgrpcclient.so qa/clients/. && \ + cp client_tmp/lib/libhttpclient.so qa/clients/. && \ + cp client_tmp/python/*.py qa/clients/. && \ + cp client_tmp/python/triton*.whl qa/pkgs/. && \ + cp client_tmp/java/examples/*.jar qa/clients/. 
&& \ + rm -rf client_tmp +# Create mount paths for lib +RUN mkdir /mylib && mkdir /home/ubuntu + +ENV TRITON_PATH ${TRITON_PATH} +ENV LD_LIBRARY_PATH /opt/tritonserver/qa/clients:${LD_LIBRARY_PATH} diff --git a/inferentia/qa/setup_test_enviroment_and_test.sh b/inferentia/qa/setup_test_enviroment_and_test.sh new file mode 100755 index 00000000..cf6057ac --- /dev/null +++ b/inferentia/qa/setup_test_enviroment_and_test.sh @@ -0,0 +1,206 @@ +#!/bin/bash +# Copyright 2021-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +export TRITON_PATH="/home/ubuntu" +export DEFAULT_REPO_TAG=${DEFAULT_REPO_TAG:="main"} +export TRITON_COMMON_REPO_TAG=${DEFAULT_REPO_TAG} +export TRITON_CORE_REPO_TAG=${DEFAULT_REPO_TAG} +export TRITON_BACKEND_REPO_TAG=${DEFAULT_REPO_TAG} +export TRITON_THIRD_PARTY_REPO_TAG=${DEFAULT_REPO_TAG} +export IDENTITY_BACKEND_REPO_TAG=${DEFAULT_REPO_TAG} +export PYTHON_BACKEND_REPO_TAG=${DEFAULT_REPO_TAG} +export CHECKSUM_REPOAGENT_REPO_TAG=${DEFAULT_REPO_TAG} +export TRITON_SERVER_REPO_TAG=${TRITON_SERVER_REPO_TAG:=${DEFAULT_REPO_TAG}} +export TRITON_CLIENT_REPO_TAG=${TRITON_CLIENT_REPO_TAG:=${DEFAULT_REPO_TAG}} +export BASE_IMAGE=tritonserver +export SDK_IMAGE=tritonserver_sdk +export BUILD_IMAGE=tritonserver_build +export QA_IMAGE=tritonserver_qa +export TEST_JSON_REPO=/opt/tritonserver/qa/common/inferentia_perf_analyzer_input_data_json +export TEST_REPO=/opt/tritonserver/qa/L0_inferentia_perf_analyzer +export TEST_SCRIPT="test.sh" +CONTAINER_NAME="qa_container" +CONTAINER_VERSION="" +UPSTREAM_CONTAINER_VERSION="" + + +USAGE=" +usage: setup_test_enviroment_and_test.sh [options]. These setting will override exported variables + +Setup environment for testing on Inferentia chips and run perf analyzer tests. +-h|--help Shows usage +-d|--default-repo-tag DEFAULT_REPO_TAG for building the test container. Default is main +-s|--server-repo-tag TRITON_SERVER_REPO_TAG for building test container. Default same DEFAULT_REPO_TAG +-c|--client-repo-tag TRITON_CLIENT_REPO_TAG for building test container. 
Default same DEFAULT_REPO_TAG +-v|--container-version Container version used in build.py. Default is container version used in build.py +-u|--upstream-container-version Upstream container version for test container. Default is container version used in build.py +-p|--triton-path The path where python backend is located and where server repo will be cloned to. Default is /home/ubuntu +" + +# Get all options: +OPTS=$(getopt -o hd:s:c:v:u:p: --long help,default-repo-tag:,server-repo-tag:,client-repo-tag:,container-version:,upstream-container-version:,triton-path -- "$@") + +for OPTS; do + case "$OPTS" in + -h|--help) + printf "%s\\n" "$USAGE" + return 0 + ;; + -d|--default-repo-tag) + export DEFAULT_REPO_TAG=$2 + echo "Default repo tag set to: ${DEFAULT_REPO_TAG}" + shift 2 + ;; + -s|--server-repo-tag) + export TRITON_SERVER_REPO_TAG=$2 + shift 2 + echo "Server repo tag set to: ${TRITON_SERVER_REPO_TAG}" + ;; + -c|--client-repo-tag) + export TRITON_CLIENT_REPO_TAG=$2 + echo "Client repo tag set to: ${TRITON_CLIENT_REPO_TAG}" + shift 2 + ;; + -v|--container-version) + export CONTAINER_VERSION=$2 + echo "Container version set to: ${CONTAINER_VERSION}" + shift 2 + ;; + -u|--upstream-container-version) + export UPSTREAM_CONTAINER_VERSION=$2 + echo "Upstream container version set to: ${UPSTREAM_CONTAINER_VERSION}" + shift 2 + ;; + -p|--triton-path) + export TRITON_PATH=$2 + echo "Triton path set to: ${TRITON_PATH}" + shift 2 + ;; + esac +done + +cd ${TRITON_PATH} +echo "Using server repo tag: $TRITON_SERVER_REPO_TAG" +# Clone necessary branches +rm -rf ${TRITON_PATH}/server +git clone --single-branch --depth=1 -b ${TRITON_SERVER_REPO_TAG} \ + https://github.com/triton-inference-server/server.git +cd ${TRITON_PATH}/server +git clone --single-branch --depth=1 -b ${TRITON_CLIENT_REPO_TAG} \ + https://github.com/triton-inference-server/client.git clientrepo + +# First set up inferentia and run in detached mode +cd ${TRITON_PATH}/python_backend +chmod 777 ${TRITON_PATH}/python_backend/inferentia/scripts/setup-pre-container.sh +sudo ${TRITON_PATH}/python_backend/inferentia/scripts/setup-pre-container.sh + +# If container version is not known, look up container version and upstream container version from build.py +cd ${TRITON_PATH}/server +if [ "${CONTAINER_VERSION}" = "" ]; then + QUERY_STRING="import build; container_version,_= build.container_versions('$(cat TRITON_VERSION)', None, None); print(container_version)" + CONTAINER_VERSION=$(python3 -c "${QUERY_STRING}") + echo "found container version: ${CONTAINER_VERSION} from build.py" +fi +if [ "${UPSTREAM_CONTAINER_VERSION}" = "" ]; then + QUERY_STRING="import build; _,upstream_container_version = build.container_versions('$(cat TRITON_VERSION)', None, None); print(upstream_container_version)" + UPSTREAM_CONTAINER_VERSION=$(python3 -c "${QUERY_STRING}") + echo "found upstream container version: ${UPSTREAM_CONTAINER_VERSION} from build.py" +fi + +# Build container with only python backend +cd ${TRITON_PATH}/server +pip3 install docker +./build.py --container-version=${CONTAINER_VERSION} \ + --upstream-container-version=${UPSTREAM_CONTAINER_VERSION} \ + --enable-logging --enable-stats --enable-tracing \ + --enable-metrics --enable-gpu-metrics --enable-gpu \ + --filesystem=gcs --filesystem=azure_storage --filesystem=s3 \ + --endpoint=http --endpoint=grpc \ + --repo-tag=common:${TRITON_COMMON_REPO_TAG} \ + --repo-tag=core:${TRITON_CORE_REPO_TAG} \ + --repo-tag=backend:${TRITON_BACKEND_REPO_TAG} \ + --repo-tag=thirdparty:${TRITON_THIRD_PARTY_REPO_TAG} \ + 
--backend=identity:${IDENTITY_BACKEND_REPO_TAG} \ + --backend=python:${PYTHON_BACKEND_REPO_TAG} \ + --repoagent=checksum:${CHECKSUM_REPOAGENT_REPO_TAG} +docker tag tritonserver_buildbase "${BUILD_IMAGE}" +docker tag tritonserver "${BASE_IMAGE}" + +# Build docker container for SDK +docker build -t ${SDK_IMAGE} \ + -f ${TRITON_PATH}/server/Dockerfile.sdk \ + --build-arg "BASE_IMAGE=nvcr.io/nvidia/tritonserver:${UPSTREAM_CONTAINER_VERSION}-py3-min" \ + --build-arg "TRITON_CLIENT_REPO_SUBDIR=clientrepo" \ + --build-arg "TRITON_COMMON_REPO_TAG=${TRITON_COMMON_REPO_TAG}" \ + --build-arg "TRITON_CORE_REPO_TAG=${TRITON_CORE_REPO_TAG}" \ + --build-arg "TRITON_BACKEND_REPO_TAG=${TRITON_BACKEND_REPO_TAG}" \ + --build-arg "TRITON_THIRD_PARTY_REPO_TAG=${TRITON_THIRD_PARTY_REPO_TAG}" \ + --build-arg "NVIDIA_TRITON_SERVER_SDK_VERSION=${CONTAINER_VERSION}" . + +# Build QA container +docker build -t ${QA_IMAGE} \ + -f ${TRITON_PATH}/python_backend/inferentia/qa/Dockerfile.QA \ + --build-arg "TRITON_PATH=${TRITON_PATH}" \ + --build-arg "BASE_IMAGE=${BASE_IMAGE}" \ + --build-arg "BUILD_IMAGE=${BUILD_IMAGE}" \ + --build-arg "SDK_IMAGE=${SDK_IMAGE}" . + +# Run pytorch instance test +docker stop ${CONTAINER_NAME} && docker rm ${CONTAINER_NAME} +docker create --name ${CONTAINER_NAME} \ + --device /dev/neuron0 \ + --device /dev/neuron1 \ + --shm-size=1g --ulimit memlock=-1 \ + -p 8000:8000 -p 8001:8001 -p 8002:8002 \ + --ulimit stack=67108864 \ + -e TEST_REPO=${TEST_REPO} \ + -e TEST_JSON_REPO=${TEST_JSON_REPO} \ + -e TRITON_PATH=${TRITON_PATH} \ + -e USE_PYTORCH="1" \ + --net host -ti ${QA_IMAGE} \ + /bin/bash -c "bash -ex ${TEST_REPO}/${TEST_SCRIPT}" && \ + docker cp /lib/udev ${CONTAINER_NAME}:/mylib/udev && \ + docker cp /home/ubuntu/python_backend ${CONTAINER_NAME}:${TRITON_PATH}/python_backend && \ + docker start -a ${CONTAINER_NAME} || RV=$?; + +# Run tensorflow instance tests +docker stop ${CONTAINER_NAME} && docker rm ${CONTAINER_NAME} +docker create --name ${CONTAINER_NAME} \ + --device /dev/neuron0 \ + --device /dev/neuron1 \ + --shm-size=1g --ulimit memlock=-1 \ + -p 8000:8000 -p 8001:8001 -p 8002:8002 \ + --ulimit stack=67108864 \ + -e TEST_REPO=${TEST_REPO} \ + -e TEST_JSON_REPO=${TEST_JSON_REPO} \ + -e TRITON_PATH=${TRITON_PATH} \ + -e USE_TENSORFLOW="1" \ + --net host -ti ${QA_IMAGE} \ + /bin/bash -c "bash -ex ${TEST_REPO}/${TEST_SCRIPT}" && \ + docker cp /lib/udev ${CONTAINER_NAME}:/mylib/udev && \ + docker cp /home/ubuntu/python_backend ${CONTAINER_NAME}:${TRITON_PATH}/python_backend && \ + docker start -a ${CONTAINER_NAME} || RV=$?; diff --git a/inferentia/scripts/gen_triton_model.py b/inferentia/scripts/gen_triton_model.py new file mode 100644 index 00000000..caa2450c --- /dev/null +++ b/inferentia/scripts/gen_triton_model.py @@ -0,0 +1,898 @@ +# Copyright 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. 
+# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import argparse +import os + + +def tf_to_triton_dtype(dtype): + import tensorflow as tf + + if dtype == tf.float16: + return "FP16" + elif dtype == tf.float32: + return "FP32" + elif dtype == tf.float64: + return "FP64" + elif dtype == tf.int8: + return "INT8" + elif dtype == tf.uint8: + return "UINT8" + elif dtype == tf.uint16: + return "UINT16" + elif dtype == tf.uint32: + return "UINT32" + elif dtype == tf.uint64: + return "UINT64" + elif dtype == tf.int16: + return "INT16" + elif dtype == tf.int32: + return "INT32" + elif dtype == tf.int64: + return "INT64" + elif dtype == tf.bool: + return "BOOL" + elif dtype == tf.string: + return "STRING" + + raise Exception("The data type in the TF model is not supported") + + +def parse_tf_tensors(saved_model_dir, tag_set, signature_def_key): + from tensorflow.python.tools import saved_model_utils + + meta_graph_def = saved_model_utils.get_meta_graph_def(saved_model_dir, tag_set) + + input_dict = {} + input_signatures = list( + meta_graph_def.signature_def[signature_def_key].inputs.values() + ) + for input_signature in input_signatures: + datatype = tf_to_triton_dtype(input_signature.dtype) + shape = [] + for dim in input_signature.tensor_shape.dim: + shape.append(dim.size) + input_dict[input_signature.name] = [datatype, shape] + + output_dict = {} + output_signatures = list( + meta_graph_def.signature_def[signature_def_key].outputs.values() + ) + for output_signature in output_signatures: + datatype = tf_to_triton_dtype(output_signature.dtype) + shape = [] + for dim in output_signature.tensor_shape.dim: + shape.append(dim.size) + output_dict[output_signature.name] = [datatype, shape] + return input_dict, output_dict + + +def parse_io_tensors(tensors): + tensors_dict = {} + for t in [t for tensor in tensors for t in tensor]: + name, datatype, shape_str = t.split(",") + shape = [int(i) for i in shape_str.split("x")] + tensors_dict[name] = [datatype, shape] + + return tensors_dict + + +def get_parameter_spec(key1, value): + param_spec = 'parameters: {{key: "{}", value: {{string_value: "{}"}}}} \n'.format( + key1, value + ) + + return param_spec + + +def create_modelconfig( + model_name, + max_batch_size, + inputs, + outputs, + compiled_model_path, + nc_start_idx, + nc_end_idx, + threads_per_core, + instance_count, + enable_dynamic_batching, + preferred_batch_size, + max_queue_delay_microseconds, +): + config = 'name: "{}"\n'.format(model_name) + config += 'backend: "python"\n' + config += "max_batch_size: {}\n".format(max_batch_size) + if enable_dynamic_batching: + config += """ 
+dynamic_batching { +""" + if preferred_batch_size is not None: + config += """ + preferred_batch_size: {} +""".format( + preferred_batch_size + ) + if max_queue_delay_microseconds is not None: + config += """ + max_queue_delay_microseconds: {} +""".format( + max_queue_delay_microseconds + ) + config += """ +}\n""" + for input_name in inputs.keys(): + data_type, shape = inputs[input_name] + config += """ +input [ + {{ + name: \"{}\" + data_type: {} + dims: {} + }} +]\n""".format( + input_name, "TYPE_" + data_type, shape + ) + for output_name in outputs.keys(): + data_type, shape = outputs[output_name] + config += """ +output [ + {{ + name: \"{}\" + data_type: {} + dims: {} + }} +]\n""".format( + output_name, "TYPE_" + data_type, shape + ) + config += """ +instance_group [ + {{ + kind: KIND_MODEL + count: {} + }} +]\n""".format( + instance_count + ) + config += get_parameter_spec("COMPILED_MODEL", compiled_model_path) + config += get_parameter_spec("NEURON_CORE_START_INDEX", nc_start_idx) + config += get_parameter_spec("NEURON_CORE_END_INDEX", nc_end_idx) + config += get_parameter_spec("NUM_THREADS_PER_CORE", threads_per_core) + return config + + +def get_model_license(): + lic = """# Copyright 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + """ + return lic + + +def get_common_initialize_impl(): + init_impl = ''' + def initialize(self, args): + """`initialize` is called only once when the model is being loaded. + Implementing `initialize` function is optional. This function allows + the model to initialize any state associated with this model. + + Parameters + ---------- + args : dict + Both keys and values are strings. The dictionary keys and values are: + * model_config: A JSON string containing the model configuration + * model_instance_kind: A string containing model instance kind + * model_instance_device_id: A string containing model instance device ID + * model_repository: Model repository path + * model_version: Model version + * model_name: Model name + """ + + # You must parse model_config. 
JSON string is not parsed here + self.model_config = model_config = json.loads(args['model_config']) + + if (len(model_config['instance_group']) != 1): + raise pb_utils.TritonModelException( + "this model supports only a single instance group, got {}". + format(len(model_config['instance_group']))) + + instance_group_config = model_config['instance_group'][0] + instance_count = instance_group_config['count'] + + instance_idx = 0 + if instance_count > 1: + instance_name_parts = args['model_instance_name'].split("_") + if not instance_name_parts[-1].isnumeric(): + raise pb_utils.TritonModelException( + "internal error: the model instance name should end with '_', got {}" + .format(args['model_instance_name'])) + instance_idx = int(instance_name_parts[-1]) + + params = model_config['parameters'] + compiled_model = params['COMPILED_MODEL']['string_value'] + + nc_start_idx = int(params['NEURON_CORE_START_INDEX']['string_value']) + nc_end_idx = int(params['NEURON_CORE_END_INDEX']['string_value']) + if nc_end_idx < nc_start_idx: + raise pb_utils.TritonModelException( + "the neuron core end index should be greater than or equal to the start index" + ) + + threads_per_core = int(params['NUM_THREADS_PER_CORE']['string_value']) + if threads_per_core < 1: + raise pb_utils.TritonModelException( + "the number of threads per core should be greater than or equal to 1" + ) + num_threads = (nc_end_idx - nc_start_idx + 1) * threads_per_core + + total_core_count = nc_end_idx - nc_start_idx + 1 + if (instance_count > total_core_count): + raise pb_utils.TritonModelException( + "can not distribute {} triton model instances to {} neuron cores" + .format(instance_count, total_core_count)) + cores_per_instance = total_core_count // instance_count +''' + return init_impl + + +def get_tensorflow_initialize_impl(is_inf2=False): + init_impl = get_common_initialize_impl() + init_impl += """ + self.input_list = [] + for config_input in model_config['input']: + self.input_list.append( + (config_input['name'], config_input['data_type'], + config_input['dims'])) + + self.output_list = [] + for config_output in model_config['output']: + self.output_list.append( + (config_output['name'], config_output['data_type'], + config_output['dims'])) + + os.environ["NEURON_RT_NUM_CORES"] = str(cores_per_instance) +""" + if is_inf2: + init_impl += """ + compiled_model = os.path.join(args['model_repository'], compiled_model) + self.pred_list = [ + tf.keras.models.load_model(compiled_model) + for _ in range(cores_per_instance) + ] * threads_per_core +""" + else: + init_impl += """ + self.pred_list = [ + tf.contrib.predictor.from_saved_model(compiled_model) + for _ in range(cores_per_instance) + ] * threads_per_core +""" + return init_impl + + +def get_pytorch_initialize_impl(is_inf2=False): + init_impl = """ + def _validate_and_get_index(self, name): + parts = name.split('__') + if len(parts) != 2: + raise pb_utils.TritonModelException( + "tensor names are expected to be in format __, got {}" + .format(name)) + + if not parts[1].isnumeric(): + raise pb_utils.TritonModelException( + "tensor names are expected to be in format __ where should be numeric, got {}" + .format(name)) + + return int(parts[1]) + + def _validate_input_dict(self, expected_count): + for i in range(expected_count): + if i not in self.input_dict: + raise pb_utils.TritonModelException( + "input corresponding to index {} not found".format(i)) + + def _validate_output_dict(self, expected_count): + for i in range(expected_count): + if i not in self.output_dict: + raise 
pb_utils.TritonModelException( + "output corresponding to index {} not found".format(i)) +""" + init_impl += get_common_initialize_impl() + init_impl += """ + self.input_dict = {} + expected_input_count = 0 + for config_input in model_config['input']: + index = self._validate_and_get_index(config_input['name']) + self.input_dict[index] = [ + config_input['name'], config_input['data_type'], + config_input['dims'] + ] + expected_input_count += 1 + self._validate_input_dict(expected_input_count) + + self.output_dict = {} + for config_output in model_config['output']: + index = self._validate_and_get_index(config_output['name']) + self.output_dict[index] = [ + config_output['name'], config_output['data_type'], + config_output['dims'] + ] + + adjusted_nc_start_idx = (instance_idx * + cores_per_instance) + nc_start_idx + cores_range = '{}-{}'.format( + adjusted_nc_start_idx, + (adjusted_nc_start_idx + cores_per_instance - 1)) + os.environ["NEURON_RT_VISIBLE_CORES"] = cores_range + + consumed_cores_list = [i for i in range(cores_per_instance)] +""" + if is_inf2: + init_impl += """ + compiled_model = os.path.join(args['model_repository'], compiled_model) + self.model_neuron = torch.jit.load(compiled_model) +""" + else: + init_impl += """ + self.model_neuron = torch.neuron.DataParallel( + torch.jit.load(compiled_model), device_ids=consumed_cores_list) +""" + init_impl += """ + self.model_neuron.num_workers = num_threads +""" + return init_impl + + +def get_tensorflow_execute_impl(disable_batch_requests_to_neuron): + exec_impl = ''' + def _one_thread(self, pred, model_feed_dict): + result = pred(model_feed_dict) + return result + + def execute(self, requests): + """`execute` MUST be implemented in every Python model. `execute` + function receives a list of pb_utils.InferenceRequest as the only + argument. This function is called when an inference request is made + for this model. Depending on the batching configuration (e.g. Dynamic + Batching) used, `requests` may contain multiple requests. Every + Python model, must create one pb_utils.InferenceResponse for every + pb_utils.InferenceRequest in `requests`. If there is an error, you can + set the error argument when creating a pb_utils.InferenceResponse + + Parameters + ---------- + requests : list + A list of pb_utils.InferenceRequest + + Returns + ------- + list + A list of pb_utils.InferenceResponse. 
The length of this list must + be the same as `requests` + """ +''' + if disable_batch_requests_to_neuron: + exec_impl += """ + responses = [] + num_threads = len(self.pred_list) + model_feed_dict_list = [{} for _ in range(num_threads)] + for request in requests: + inputs = [] + for i in range(len(self.input_list)): + name, dt, shape = self.input_list[i] + tensor = pb_utils.get_input_tensor_by_name(request, + name).as_numpy() + split_tensor = [None] * num_threads + for split_index in range(num_threads): + model_feed_dict_list[split_index][name] = np.array_split( + tensor, num_threads, axis=0)[split_index] + executor = futures.ThreadPoolExecutor(max_workers=num_threads) + running = { + executor.submit(self._one_thread, self.pred_list[idx], + model_feed_dict_list[idx]): idx + for idx in range(num_threads) + } + results = [None] * num_threads + for future in futures.as_completed(running): + idx = running[future] + results[idx] = future.result() + output_tensors = [] + for i in range(len(self.output_list)): + name, dt, shape = self.output_list[i] + out_list = [None] * num_threads + for idx in range(num_threads): + out_list[idx] = results[idx][name] + full_tensor = out_list[0] + for idx in range(num_threads - 1): + full_tensor = np.concatenate( + (full_tensor, out_list[idx + 1]), axis=0) + output_tensor = pb_utils.Tensor( + name, + full_tensor.astype(pb_utils.triton_string_to_numpy(dt))) + output_tensors.append(output_tensor) + inference_response = pb_utils.InferenceResponse( + output_tensors=output_tensors) + responses.append(inference_response) + return responses +""" + else: + exec_impl += """ + responses = [] + num_threads = len(self.pred_list) + model_feed_dict_list = [{} for _ in range(num_threads)] + num_requests = len(requests) + request_batch_sizes = [] + inputs = [] + for i in range(len(self.input_list)): + name, dt, shape = self.input_list[i] + first_tensor = pb_utils.get_input_tensor_by_name(requests[0], name).as_numpy() + request_batch_sizes.append(np.size(first_tensor, axis=0)) + batched_tensor = first_tensor + for j in range(1, num_requests): + tensor = pb_utils.get_input_tensor_by_name(requests[j], + name).as_numpy() + request_batch_sizes.append(request_batch_sizes[-1] + np.size(tensor, axis=0)) + batched_tensor = np.concatenate((batched_tensor, tensor), axis=0) + split_tensor = [None] * num_threads + for split_index in range(num_threads): + model_feed_dict_list[split_index][name] = np.array_split( + batched_tensor, num_threads, axis=0)[split_index] + + executor = futures.ThreadPoolExecutor(max_workers=num_threads) + running = { + executor.submit(self._one_thread, self.pred_list[idx], + model_feed_dict_list[idx]): idx + for idx in range(num_threads) + } + + results = [None] * num_threads + for future in futures.as_completed(running): + idx = running[future] + results[idx] = future.result() + + chuncky_tensors = [] + for i in range(len(self.output_list)): + name, dt, shape = self.output_list[i] + out_list = [None] * num_threads + for idx in range(num_threads): + out_list[idx] = results[idx][name] + full_tensor = out_list[0] + for idx in range(num_threads - 1): + full_tensor = np.concatenate( + (full_tensor, out_list[idx + 1]), axis=0) + chuncky_tensors.append(np.split(full_tensor, request_batch_sizes, axis=0)) + + for i in range(num_requests): + output_tensors = [] + for j in range(len(self.output_list)): + name, dt, shape = self.output_list[j] + tensor = chuncky_tensors[j][i] + output_tensor = pb_utils.Tensor( + name, + tensor.astype(pb_utils.triton_string_to_numpy(dt))) + 
output_tensors.append(output_tensor) + + inference_response = pb_utils.InferenceResponse(output_tensors=output_tensors) + responses.append(inference_response) + + return responses +""" + return exec_impl + + +def get_pytorch_execute_impl(disable_batch_requests_to_neuron): + exec_impl = ''' + def execute(self, requests): + """`execute` MUST be implemented in every Python model. `execute` + function receives a list of pb_utils.InferenceRequest as the only + argument. This function is called when an inference request is made + for this model. Depending on the batching configuration (e.g. Dynamic + Batching) used, `requests` may contain multiple requests. Every + Python model, must create one pb_utils.InferenceResponse for every + pb_utils.InferenceRequest in `requests`. If there is an error, you can + set the error argument when creating a pb_utils.InferenceResponse + + Parameters + ---------- + requests : list + A list of pb_utils.InferenceRequest + + Returns + ------- + list + A list of pb_utils.InferenceResponse. The length of this list must + be the same as `requests` + """ +''' + if disable_batch_requests_to_neuron: + exec_impl += """ + responses = [] + for request in requests: + inputs = [] + for i in range(len(self.input_dict)): + name, dt, shape = self.input_dict[i] + tensor = torch.as_tensor(pb_utils.get_input_tensor_by_name(request, + name).as_numpy()) + inputs.append(tensor) + results = self.model_neuron(*inputs) + output_tensors = [] + for i in self.output_dict.keys(): + name, dt, shape = self.output_dict[i] + result = results[i] if isinstance(results, tuple) else results + output_tensor = pb_utils.Tensor( + name, result.numpy().astype( + pb_utils.triton_string_to_numpy(dt))) + output_tensors.append(output_tensor) + inference_response = pb_utils.InferenceResponse( + output_tensors=output_tensors) + responses.append(inference_response) + return responses +""" + else: + exec_impl += """ + responses = [] + inputs = [] + num_requests = len(requests) + request_batch_sizes = [] + for i in self.input_dict.keys(): + name, dt, shape = self.input_dict[i] + first_tensor = torch.as_tensor(pb_utils.get_input_tensor_by_name(requests[0], + name).as_numpy()) + request_batch_sizes.append(first_tensor.size(dim=0)) + batched_tensor = first_tensor + for j in range(1, num_requests): + tensor = torch.as_tensor(pb_utils.get_input_tensor_by_name(requests[j], + name).as_numpy()) + request_batch_sizes.append(request_batch_sizes[-1] + tensor.size(dim=0)) + batched_tensor = torch.cat((batched_tensor, tensor), dim=0) + inputs.append(batched_tensor) + + batched_results = self.model_neuron(*inputs) + chunky_batched_results = [] + for i in self.output_dict.keys(): + batch = batched_results[i] if isinstance(batched_results, tuple) else batched_results + chunky_batched_results.append(torch.tensor_split(batch, request_batch_sizes, dim=0)) + for i in range(num_requests): + output_tensors = [] + for j in self.output_dict.keys(): + name, dt, shape = self.output_dict[j] + result = chunky_batched_results[j][i] + output_tensor = pb_utils.Tensor( + name, result.numpy().astype( + pb_utils.triton_string_to_numpy(dt))) + output_tensors.append(output_tensor) + inference_response = pb_utils.InferenceResponse( + output_tensors=output_tensors) + responses.append(inference_response) + + return responses +""" + return exec_impl + + +def get_finalize_impl(): + finalize_impl = ''' + def finalize(self): + """`finalize` is called only once when the model is being unloaded. + Implementing `finalize` function is OPTIONAL. 
This function allows + the model to perform any necessary clean ups before exit. + """ + print('Cleaning up...') + +''' + return finalize_impl + + +def get_triton_python_model_impl( + using_tensorflow_model, disable_batch_requests_to_neuron, is_inf2=False +): + triton_pmi = ''' +class TritonPythonModel: + """Your Python model must use the same class name. Every Python model + that is created must have "TritonPythonModel" as the class name. + """ + ''' + + if using_tensorflow_model: + triton_pmi += get_tensorflow_initialize_impl(is_inf2) + triton_pmi += get_tensorflow_execute_impl(disable_batch_requests_to_neuron) + else: + triton_pmi += get_pytorch_initialize_impl(is_inf2) + triton_pmi += get_pytorch_execute_impl(disable_batch_requests_to_neuron) + + triton_pmi += get_finalize_impl() + + return triton_pmi + + +def create_model_file( + using_tensorflow_model, disable_batch_requests_to_neuron, is_inf2=False +): + triton_model = get_model_license() + triton_model += """ +import json +import numpy as np +import os +import sys +import triton_python_backend_utils as pb_utils +""" + + if using_tensorflow_model: + triton_model += """ +import tensorflow as tf +from concurrent import futures +""" + else: + triton_model += """ +import torch + """ + if not is_inf2: + triton_model += """ +import torch.neuron + """ + else: + triton_model += """ +import torch_neuronx +""" + triton_model += get_triton_python_model_impl( + using_tensorflow_model, disable_batch_requests_to_neuron, is_inf2 + ) + return triton_model + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument( + "--inf2", + required=False, + default=False, + action="http://github.com/store_true", + help="Specify whether the model should be generate for inf2 or inf1, default is inf1", + ) + parser.add_argument( + "--model_type", + type=str, + required=True, + choices=["pytorch", "tensorflow"], + help="""The type of the compiled model. Currently, + only supports \"pytorch\" and \"tensorflow\".""", + ) + parser.add_argument( + "--model_version", type=int, default=1, help="The version of the model" + ) + parser.add_argument( + "--enable_dynamic_batching", + action="http://github.com/store_true", + help="""Enable dynamic batching. Please see model configuration + documentation for details: + https://github.com/triton-inference-server/server/blob/main/docs/model_configuration.md#dynamic-batcher""", + ) + parser.add_argument( + "--max_batch_size", + type=int, + default=0, + help="""The maximum batch size for the model being generated. + Please see model configuration documentation for details: + https://github.com/triton-inference-server/server/blob/main/docs/model_configuration.md#maximum-batch-size""", + ) + parser.add_argument( + "--preferred_batch_size", + type=int, + help="""The preferred batch size. Should be multiples + of cores available to ensure proper utilization of + neuron cores. + This flag is ignored if --enable_dynamic_batching is + not specified. Please see model configuration + documentation for details: + https://github.com/triton-inference-server/server/blob/main/docs/model_configuration.md#preferred-batch-sizes""", + ) + parser.add_argument( + "--max_queue_delay_microseconds", + type=int, + help="""Max queue delay time(ms) for dynamic batching. + This flag is ignored if --enable_dynamic_batching is not specified. 
+ Please see model configuration documentation for details: + https://github.com/triton-inference-server/server/blob/main/docs/model_configuration.md#delayed-batching""", + ) + parser.add_argument( + "--disable_batch_requests_to_neuron", + action="http://github.com/store_true", + help="""Send each request separately to neuron if enabled. + If not specified, then requests are combined and sent to + neuron as a single batch""", + ) + parser.add_argument( + "--tag_set", + type=str, + default="serve", + help="""The tag set to use for the TF model. + This option is ignored if `--model_type` is + not \"tensorflow\". Default value is \'serve\'.""", + ) + parser.add_argument( + "--signature_def_key", + type=str, + default="serving_default", + help="""The signature def key to use for the TF + model. This option is ignored if `--model_type` + is not \"tensorflow\". Default value + is \'serving_default\'.""", + ) + parser.add_argument( + "--compiled_model", + type=str, + required=True, + help="Fullpath to the compiled model", + ) + parser.add_argument( + "--triton_input", + type=str, + action="http://github.com/append", + nargs="*", + help="""The name, datatype and shape of the model input in + format ,,. This + option can be provided multiple times for multiple + inputs. For example, to provide a FP16 input with + shape [1,384] specify the following: INPUT0,FP16,1x384. + This option is not required when using tensorflow model""", + ) + parser.add_argument( + "--triton_output", + type=str, + action="http://github.com/append", + nargs="*", + help="""The name, datatype and shape of the model output in + format ,,. This + option can be provided multiple times for multiple + outputs. For example, to provide a FP16 output with + shape [1,384] specify the following: OUTPUT0,FP16,1x384. + This option is not required when using tensorflow model""", + ) + parser.add_argument( + "--neuron_core_range", + type=str, + required=True, + help="""The range of neuron core indices + where the model needs to be loaded. The + range should be specified in format + :. For example to + load model on neuron cores (0-7), specify + the following: 0:7. NOTE: when using + multiple triton model instances the neuron + cores will get equally distributed. Assuming + the instance count is 4, Instance0 will get + loaded on cores 0:1, Instance1 will get loaded + on cores 2:3, Instance2 will get loaded on + cores 4:5 and Instance 3 will get loaded on + cores 6:7""", + ) + parser.add_argument( + "--threads_per_core", + type=int, + default=1, + help="The number of threads per neuron core.", + ) + parser.add_argument( + "--triton_model_instance_count", + type=int, + default=1, + help="The number of triton model instances.", + ) + parser.add_argument( + "--triton_model_dir", + type=str, + required=True, + help="""Path to the triton model + directory where script will generate + config.pbtxt and model.py""", + ) + FLAGS, unparsed = parser.parse_known_args() + if len(unparsed) > 0: + raise Exception("Unrecognized options: {}".format(unparsed)) + + if FLAGS.model_type == "tensorflow": + is_tensorflow_model = True + elif FLAGS.model_type == "pytorch": + is_tensorflow_model = False + + print( + """Triton Dynamic Batching is enabled: {}, + preferred_batch_size: {} and max_batch_size: {} + with max_queue_delay_microseconds: {}. 
+ Batch requests to neruon are disabled: {}""".format( + FLAGS.enable_dynamic_batching, + FLAGS.preferred_batch_size, + FLAGS.max_batch_size, + FLAGS.max_queue_delay_microseconds, + FLAGS.disable_batch_requests_to_neuron, + ) + ) + + if not is_tensorflow_model or ( + FLAGS.triton_input != None and FLAGS.triton_output != None + ): + inputs = parse_io_tensors(FLAGS.triton_input) + outputs = parse_io_tensors(FLAGS.triton_output) + else: + inputs, outputs = parse_tf_tensors( + FLAGS.compiled_model, FLAGS.tag_set, FLAGS.signature_def_key + ) + + nc_start_idx, nc_end_idx = [int(i) for i in FLAGS.neuron_core_range.split(":")] + + model_version_dir = FLAGS.triton_model_dir + "/" + str(FLAGS.model_version) + try: + os.makedirs(model_version_dir) + except OSError as ex: + pass # ignore existing dir + + model_name = os.path.basename(FLAGS.triton_model_dir) + mc = create_modelconfig( + model_name, + FLAGS.max_batch_size, + inputs, + outputs, + FLAGS.compiled_model, + nc_start_idx, + nc_end_idx, + FLAGS.threads_per_core, + FLAGS.triton_model_instance_count, + FLAGS.enable_dynamic_batching, + FLAGS.preferred_batch_size, + FLAGS.max_queue_delay_microseconds, + ) + with open(FLAGS.triton_model_dir + "/config.pbtxt", "w") as config_file: + config_file.write(mc) + + is_inf2 = FLAGS.inf2 + + mf = create_model_file( + is_tensorflow_model, FLAGS.disable_batch_requests_to_neuron, is_inf2 + ) + with open(FLAGS.triton_model_dir + "/1/model.py", "w") as model_file: + model_file.write(mf) diff --git a/inferentia/scripts/setup-pre-container.sh b/inferentia/scripts/setup-pre-container.sh new file mode 100755 index 00000000..f6f5ae16 --- /dev/null +++ b/inferentia/scripts/setup-pre-container.sh @@ -0,0 +1,130 @@ +#!/bin/bash +# Copyright 2021-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +#! /bin/sh + +USAGE=" +usage: setup.sh [options] + +Sets up runtime and tools for execution on Inferentia chips. 
+-h|--help Shows usage +-inf1|--inf1-setup Installs runtime and tools for inf1/neuron, inf1 is default +-inf2|--inf2-setup Installs runtime and tools for inf2/neuronx +-trn1|--trn1-setup Installs runtime, tools for inf2, and installs EFA for trn1 +" + +# Get all options: +OPTS=$(getopt -o hb:v:i:tp --long help,python-backend-path:,python-version:,inferentia-path:,use-tensorflow,use-pytorch,tensorflow-version: -- "$@") + + +export INSTALL_INF2=0 +export INSTALL_INF1=1 +export INSTALL_TRN1=0 + +export CWD=`pwd` + +cd /home/ubuntu + +for OPTS; do + case "$OPTS" in + -h|--help) + printf "%s\\n" "$USAGE" + return 0 + ;; + -inf1|--inf1-setup) + INSTALL_INF1=1 + echo "Script will install runtime and tools for inf1/neuron" + shift 1 + ;; + -inf2|--inf2-setup) + INSTALL_INF2=1 + shift 1 + echo "Script will install runtime and tools for inf2/neruonx" + ;; + -trn1|--trn1-setup) + INSTALL_TRN1=1 + echo "Script will install runtime and tools for trn1" + shift 1 + ;; + esac +done + +if [ ${INSTALL_INF1} -ne 1 ] && [ ${INSTALL_INF2} -ne 1 ]; then + echo "Error: need to specify either -inf1, -inf2 or -trn1." + printf "source %s\\n" "$USAGE" + return 1 +fi + +if [ ${INSTALL_INF1} -eq 1 ] && [ ${INSTALL_INF2} -eq 1] +then + echo "Error: cannot install both inf1 and inf2 dependencies. Please select either -inf1 or -inf2." + return 1 +fi + +if [ ${INSTALL_INF1} -eq 1 ] && [ ${INSTALL_TRN1} -eq 1 ] +then + echo "Error: cannot install both inf1 and trn1 dependencies. Selecting -trn1 will install inf2 dependencies and EFA." +fi + +# First stop and remove old neuron 1.X runtime +sudo systemctl stop neuron-rtd || true +sudo apt remove aws-neuron-runtime -y || true + +# Then install new neuron libraries +. /etc/os-release +sudo tee /etc/apt/sources.list.d/neuron.list > /dev/null < /dev/null <& correlation_id_shm) +{ + id_uint_ = correlation_id_shm->id_uint_; + id_type_ = correlation_id_shm->id_type_; + id_string_ = correlation_id_shm->id_string_; +} + +CorrelationId& +CorrelationId::operator=(const CorrelationId& rhs) +{ + id_uint_ = rhs.id_uint_; + id_type_ = rhs.id_type_; + id_string_ = rhs.id_string_; + return *this; +} + +void +CorrelationId::SaveToSharedMemory( + std::unique_ptr& shm_pool) +{ + AllocatedSharedMemory correlation_id_shm = + shm_pool->Construct(); + correlation_id_shm_ptr_ = correlation_id_shm.data_.get(); + + std::unique_ptr id_string_shm = + PbString::Create(shm_pool, id_string_); + + correlation_id_shm_ptr_->id_uint = id_uint_; + correlation_id_shm_ptr_->id_string_shm_handle = id_string_shm->ShmHandle(); + correlation_id_shm_ptr_->id_type = id_type_; + + // Save the references to shared memory. 
+ correlation_id_shm_ = std::move(correlation_id_shm); + id_string_shm_ = std::move(id_string_shm); + shm_handle_ = correlation_id_shm_.handle_; +} + +std::unique_ptr +CorrelationId::LoadFromSharedMemory( + std::unique_ptr& shm_pool, + bi::managed_external_buffer::handle_t handle) +{ + AllocatedSharedMemory correlation_id_shm = + shm_pool->Load(handle); + CorrelationIdShm* correlation_id_shm_ptr = correlation_id_shm.data_.get(); + + std::unique_ptr id_string_shm = PbString::LoadFromSharedMemory( + shm_pool, correlation_id_shm_ptr->id_string_shm_handle); + + return std::unique_ptr( + new CorrelationId(correlation_id_shm, id_string_shm)); +} + +CorrelationId::CorrelationId( + AllocatedSharedMemory& correlation_id_shm, + std::unique_ptr& id_string_shm) + : correlation_id_shm_(std::move(correlation_id_shm)), + id_string_shm_(std::move(id_string_shm)) +{ + correlation_id_shm_ptr_ = correlation_id_shm_.data_.get(); + shm_handle_ = correlation_id_shm_.handle_; + id_string_ = id_string_shm_->String(); + id_uint_ = correlation_id_shm_ptr_->id_uint; + id_type_ = correlation_id_shm_ptr_->id_type; +} + +}}}; // namespace triton::backend::python diff --git a/src/correlation_id.h b/src/correlation_id.h new file mode 100644 index 00000000..63185d9f --- /dev/null +++ b/src/correlation_id.h @@ -0,0 +1,93 @@ +// Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions +// are met: +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// * Neither the name of NVIDIA CORPORATION nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +#pragma once + +#include + +#include "pb_string.h" +#include "pb_utils.h" + +namespace triton { namespace backend { namespace python { + +enum class CorrelationIdDataType { UINT64, STRING }; + +struct CorrelationIdShm { + bi::managed_external_buffer::handle_t id_string_shm_handle; + uint64_t id_uint; + CorrelationIdDataType id_type; +}; + +class CorrelationId { + public: + CorrelationId(); + CorrelationId(const std::string& id_string); + CorrelationId(uint64_t id_uint); + CorrelationId(const CorrelationId& rhs); + CorrelationId(std::unique_ptr& correlation_id_shm); + CorrelationId& operator=(const CorrelationId& rhs); + + /// Save CorrelationId object to shared memory. + /// \param shm_pool Shared memory pool to save the CorrelationId object. + void SaveToSharedMemory(std::unique_ptr& shm_pool); + + /// Create a CorrelationId object from shared memory. + /// \param shm_pool Shared memory pool + /// \param handle Shared memory handle of the CorrelationId. + /// \return Returns the CorrelationId in the specified handle + /// location. + static std::unique_ptr LoadFromSharedMemory( + std::unique_ptr& shm_pool, + bi::managed_external_buffer::handle_t handle); + + // Function that help determine exact type of Correlation Id + CorrelationIdDataType Type() const { return id_type_; } + + // Get the value of the CorrelationId based on the type + const std::string& StringValue() const { return id_string_; } + uint64_t UnsignedIntValue() const { return id_uint_; } + + bi::managed_external_buffer::handle_t ShmHandle() { return shm_handle_; } + + private: + // The private constructor for creating a CorrelationId object from shared + // memory. + CorrelationId( + AllocatedSharedMemory& correlation_id_shm, + std::unique_ptr& id_string_shm); + + std::string id_string_; + uint64_t id_uint_; + CorrelationIdDataType id_type_; + + // Shared Memory Data Structures + AllocatedSharedMemory correlation_id_shm_; + CorrelationIdShm* correlation_id_shm_ptr_; + bi::managed_external_buffer::handle_t shm_handle_; + std::unique_ptr id_string_shm_; +}; + +}}}; // namespace triton::backend::python diff --git a/src/gpu_buffers.cc b/src/gpu_buffers.cc new file mode 100644 index 00000000..4b1b0f9f --- /dev/null +++ b/src/gpu_buffers.cc @@ -0,0 +1,89 @@ +// Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions +// are met: +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// * Neither the name of NVIDIA CORPORATION nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +#include "gpu_buffers.h" + +#include "pb_string.h" + +namespace triton { namespace backend { namespace python { +GPUBuffersHelper::GPUBuffersHelper() +{ + completed_ = false; +} + +void +GPUBuffersHelper::AddBuffer(const bi::managed_external_buffer::handle_t& handle) +{ + if (completed_) { + throw PythonBackendException( + "It is not possible to add buffers after 'Complete' has been called on " + "a GPUBuffersHelper."); + } + + buffers_.emplace_back(handle); +} + +void +GPUBuffersHelper::SetError( + std::unique_ptr& shm_pool, const std::string& error) +{ + error_shm_ = PbString::Create(shm_pool, error); +} + +void +GPUBuffersHelper::Complete(std::unique_ptr& shm_pool) +{ + if (completed_) { + throw PythonBackendException( + "Complete has already been called. Complete should only be called " + "once."); + } + gpu_buffers_shm_ = shm_pool->Construct(); + if (!error_shm_) { + buffers_handle_shm_ = + shm_pool->Construct( + buffers_.size()); + gpu_buffers_shm_.data_->buffer_count = buffers_.size(); + gpu_buffers_shm_.data_->success = true; + gpu_buffers_shm_.data_->buffers = buffers_handle_shm_.handle_; + for (size_t i = 0; i < buffers_.size(); ++i) { + buffers_handle_shm_.data_.get()[i] = buffers_[i]; + } + } else { + gpu_buffers_shm_.data_->success = false; + gpu_buffers_shm_.data_->error = error_shm_->ShmHandle(); + } + completed_ = true; +} + + +bi::managed_external_buffer::handle_t +GPUBuffersHelper::ShmHandle() +{ + return gpu_buffers_shm_.handle_; +} + +}}} // namespace triton::backend::python diff --git a/src/gpu_buffers.h b/src/gpu_buffers.h new file mode 100644 index 00000000..fd683ba7 --- /dev/null +++ b/src/gpu_buffers.h @@ -0,0 +1,67 @@ +// Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions +// are met: +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// * Neither the name of NVIDIA CORPORATION nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +#pragma once + +#include "pb_string.h" +#include "pb_utils.h" +#include "scoped_defer.h" + +namespace triton { namespace backend { namespace python { + +/// \param success indicating whether the process of fetching the GPU buffers +/// was successful. +/// \param error if success is equal to false, the error object will be set. +/// \param buffers list of buffers elements. +/// \param buffer_count the number of buffers. +struct GPUBuffersShm { + bool success; + bi::managed_external_buffer::handle_t error; + bi::managed_external_buffer::handle_t buffers; + uint32_t buffer_count; +}; + +/// Helper class to facilitate transfer of metadata associated +/// the GPU buffers in shared memory. +class GPUBuffersHelper { + public: + GPUBuffersHelper(); + void AddBuffer(const bi::managed_external_buffer::handle_t& handle); + void Complete(std::unique_ptr& shm_pool); + void SetError( + std::unique_ptr& shm_pool, const std::string& error); + bi::managed_external_buffer::handle_t ShmHandle(); + + private: + AllocatedSharedMemory gpu_buffers_shm_; + std::vector buffers_; + AllocatedSharedMemory + buffers_handle_shm_; + std::unique_ptr error_shm_; + bool completed_; +}; + +}}}; // namespace triton::backend::python diff --git a/src/infer_payload.cc b/src/infer_payload.cc new file mode 100644 index 00000000..6baad307 --- /dev/null +++ b/src/infer_payload.cc @@ -0,0 +1,122 @@ +// Copyright 2023-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions +// are met: +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// * Neither the name of NVIDIA CORPORATION nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +#include "infer_payload.h" + +namespace triton { namespace backend { namespace python { + +InferPayload::InferPayload( + const bool is_decoupled, + std::function)> callback) + : is_decoupled_(is_decoupled), is_promise_set_(false), callback_(callback), + request_address_(reinterpret_cast(nullptr)) +{ + promise_.reset(new std::promise>()); +} + +void +InferPayload::SetValue(std::unique_ptr infer_response) +{ + { + // Only set value to the promise with the first response. Call the callback + // function to send decoupled response to the stub. + std::lock_guard lock(mutex_); + if (!is_promise_set_) { + is_promise_set_ = true; + promise_->set_value(std::move(infer_response)); + return; + } + } + Callback(std::move(infer_response)); +} + +void +InferPayload::SetFuture( + std::future>& response_future) +{ + response_future = promise_->get_future(); +} + +bool +InferPayload::IsDecoupled() +{ + return is_decoupled_; +} + +bool +InferPayload::IsPromiseSet() +{ + return is_promise_set_; +} + +void +InferPayload::Callback(std::unique_ptr infer_response) +{ + return callback_(std::move(infer_response)); +} + +void +InferPayload::SetResponseAllocUserp( + const ResponseAllocatorUserp& response_alloc_userp) +{ + response_alloc_userp_ = + std::make_shared(response_alloc_userp); +} + +std::shared_ptr +InferPayload::ResponseAllocUserp() +{ + return response_alloc_userp_; +} + +void +InferPayload::SetRequestAddress(intptr_t request_address) +{ + std::unique_lock lock(request_address_mutex_); + request_address_ = request_address; +} + +void +InferPayload::SetRequestCancellationFunc( + const std::function& request_cancel_func) +{ + request_cancel_func_ = request_cancel_func; +} + +void +InferPayload::SafeCancelRequest() +{ + std::unique_lock lock(request_address_mutex_); + if (request_address_ == 0L) { + return; + } + + if (request_cancel_func_) { + request_cancel_func_(request_address_); + } +} + +}}} // namespace triton::backend::python diff --git a/src/infer_payload.h b/src/infer_payload.h new file mode 100644 index 00000000..8e4aa7d3 --- /dev/null +++ b/src/infer_payload.h @@ -0,0 +1,82 @@ +// Copyright 2023-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions +// are met: +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// * Neither the name of NVIDIA CORPORATION nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +#pragma once + +#include +#include + +#include "infer_response.h" +#include "pb_preferred_memory.h" + +namespace triton { namespace backend { namespace python { + +struct ResponseAllocatorUserp { + ResponseAllocatorUserp( + void* shm_pool, const PreferredMemory& preferred_memory) + : shm_pool(shm_pool), preferred_memory(preferred_memory) + { + } + void* shm_pool; + PreferredMemory preferred_memory; +}; + +class InferPayload : public std::enable_shared_from_this { + public: + InferPayload( + const bool is_decouple, + std::function)> callback); + + /// GetPtr should be only called when the InferPayload object is constructed + /// using a shared pointer. Calling this function in any other circumstance + /// is undefined behaviour until C++17. + std::shared_ptr GetPtr() { return shared_from_this(); } + void SetValue(std::unique_ptr infer_response); + void SetFuture(std::future>& response_future); + bool IsDecoupled(); + bool IsPromiseSet(); + void Callback(std::unique_ptr infer_response); + void SetResponseAllocUserp( + const ResponseAllocatorUserp& response_alloc_userp); + std::shared_ptr ResponseAllocUserp(); + void SetRequestAddress(intptr_t request_address); + void SetRequestCancellationFunc( + const std::function& request_cancel_func); + void SafeCancelRequest(); + + private: + std::unique_ptr>> promise_; + bool is_decoupled_; + std::mutex mutex_; + bool is_promise_set_; + std::function)> callback_; + std::shared_ptr response_alloc_userp_; + std::mutex request_address_mutex_; + intptr_t request_address_; + std::function request_cancel_func_; +}; + +}}} // namespace triton::backend::python diff --git a/src/infer_request.cc b/src/infer_request.cc new file mode 100644 index 00000000..e5733662 --- /dev/null +++ b/src/infer_request.cc @@ -0,0 +1,610 @@ +// Copyright 2021-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions +// are met: +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// * Neither the name of NVIDIA CORPORATION nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +#include "infer_request.h" + +#include + +#include "gpu_buffers.h" +#include "pb_utils.h" +#include "scoped_defer.h" +#ifdef TRITON_PB_STUB +#include "pb_stub.h" +#endif + +namespace triton { namespace backend { namespace python { + +InferRequest::InferRequest( + const std::string& request_id, const CorrelationId& correlation_id, + const std::vector>& inputs, + const std::set& requested_output_names, + const std::string& model_name, const int64_t model_version, + const std::string& parameters, const uint32_t flags, const uint64_t timeout, + const intptr_t response_factory_address, const intptr_t request_address, + const PreferredMemory& preferred_memory, const InferenceTrace& trace) + : request_id_(request_id), correlation_id_(correlation_id), inputs_(inputs), + requested_output_names_(requested_output_names), model_name_(model_name), + model_version_(model_version), parameters_(parameters), flags_(flags), + timeout_(timeout), response_factory_address_(response_factory_address), + request_address_(request_address), preferred_memory_(preferred_memory), + trace_(trace), request_release_flags_(TRITONSERVER_REQUEST_RELEASE_ALL) +{ + for (auto& input : inputs) { + if (!input) { + throw PythonBackendException( + "Input tensor for request with id '" + request_id + + "' and model name '" + model_name + "' should not be empty."); + } + } + + for (auto& requested_output_name : requested_output_names) { + if (requested_output_name == "") { + throw PythonBackendException( + "Requested output name for request with id '" + request_id + + "' and model name '" + model_name + "' should not be empty."); + } + } + +#ifdef TRITON_PB_STUB + pb_cancel_ = + std::make_shared(response_factory_address_, request_address_); + response_sender_ = std::make_shared( + request_address_, response_factory_address_, nullptr /* is_decoupled */, + RequestedOutputNames(), Stub::GetOrCreateInstance()->SharedMemory(), + pb_cancel_); +#endif +} + +const std::vector>& +InferRequest::Inputs() +{ + return inputs_; +} + +const std::string& +InferRequest::Parameters() +{ + return parameters_; +} + +const std::string& +InferRequest::RequestId() +{ + return request_id_; +} + +CorrelationId& +InferRequest::GetCorrelationId() +{ + return correlation_id_; +} + +const std::set& +InferRequest::RequestedOutputNames() +{ + return requested_output_names_; +} + +const std::string& +InferRequest::ModelName() +{ + return model_name_; +} + +int64_t +InferRequest::ModelVersion() +{ + return model_version_; +} + +uint32_t +InferRequest::Flags() +{ + return flags_; +} + +intptr_t +InferRequest::RequestAddress() +{ + return request_address_; +} + +void +InferRequest::SetFlags(uint32_t flags) +{ + flags_ = flags; +} + +bi::managed_external_buffer::handle_t +InferRequest::ShmHandle() +{ + return shm_handle_; +} + +uint64_t +InferRequest::Timeout() +{ + return timeout_; +} + +void +InferRequest::SetIsDecoupled(const bool is_decoupled) +{ + is_decoupled_ = is_decoupled; +} + +bool +InferRequest::IsDecoupled() +{ + return 
is_decoupled_; +} + +PreferredMemory& +InferRequest::GetPreferredMemory() +{ + return preferred_memory_; +} + +InferenceTrace& +InferRequest::GetTrace() +{ + return trace_; +} + +uint32_t +InferRequest::ReleaseFlags() +{ + request_release_flags_ = infer_request_shm_ptr_->request_release_flags; + return request_release_flags_; +} + +void +InferRequest::SetReleaseFlags(const uint32_t& flags) +{ + request_release_flags_ = flags; + infer_request_shm_ptr_->request_release_flags = request_release_flags_; +} + +void +InferRequest::SaveToSharedMemory(std::unique_ptr& shm_pool) +{ + AllocatedSharedMemory infer_request_shm = shm_pool->Construct( + sizeof(InferRequestShm) + + (RequestedOutputNames().size() * + sizeof(bi::managed_external_buffer::handle_t)) + + (Inputs().size() * sizeof(bi::managed_external_buffer::handle_t))); + + infer_request_shm_ptr_ = + reinterpret_cast(infer_request_shm.data_.get()); + infer_request_shm_ptr_->input_count = Inputs().size(); + infer_request_shm_ptr_->model_version = model_version_; + infer_request_shm_ptr_->requested_output_count = + RequestedOutputNames().size(); + infer_request_shm_ptr_->flags = Flags(); + infer_request_shm_ptr_->address = request_address_; + infer_request_shm_ptr_->response_factory_address = response_factory_address_; + infer_request_shm_ptr_->is_decoupled = is_decoupled_; + infer_request_shm_ptr_->timeout = timeout_; + infer_request_shm_ptr_->preferred_memory = preferred_memory_; + infer_request_shm_ptr_->request_release_flags = request_release_flags_; + + output_names_handle_shm_ptr_ = + reinterpret_cast( + reinterpret_cast(infer_request_shm_ptr_) + + sizeof(InferRequestShm)); + + // [FIXME] This could also be a part of the single allocated memory for this + // object. + size_t i = 0; + std::vector> requested_output_names_shm; + for (auto& requested_output_name : requested_output_names_) { + std::unique_ptr requested_output_name_shm = + PbString::Create(shm_pool, requested_output_name); + output_names_handle_shm_ptr_[i] = requested_output_name_shm->ShmHandle(); + requested_output_names_shm.emplace_back( + std::move(requested_output_name_shm)); + i++; + } + + input_tensors_handle_ptr_ = + reinterpret_cast( + reinterpret_cast(output_names_handle_shm_ptr_) + + sizeof(bi::managed_external_buffer::handle_t) * + RequestedOutputNames().size()); + i = 0; + for (auto& input : Inputs()) { + input_tensors_handle_ptr_[i] = input->ShmHandle(); + i++; + } + + correlation_id_.SaveToSharedMemory(shm_pool); + infer_request_shm_ptr_->correlation_id_shm_handle = + correlation_id_.ShmHandle(); + + std::unique_ptr model_name_shm = + PbString::Create(shm_pool, ModelName()); + infer_request_shm_ptr_->model_name_shm_handle = model_name_shm->ShmHandle(); + + std::unique_ptr request_id_shm = + PbString::Create(shm_pool, RequestId()); + infer_request_shm_ptr_->request_id_shm_handle = request_id_shm->ShmHandle(); + + std::unique_ptr parameters_shm = + PbString::Create(shm_pool, Parameters()); + infer_request_shm_ptr_->parameters_shm_handle = parameters_shm->ShmHandle(); + + trace_.SaveToSharedMemory(shm_pool); + infer_request_shm_ptr_->trace_shm_handle = trace_.ShmHandle(); + + // Save the references to shared memory. 
+ infer_request_shm_ = std::move(infer_request_shm); + request_id_shm_ = std::move(request_id_shm); + model_name_shm_ = std::move(model_name_shm); + parameters_shm_ = std::move(parameters_shm); + shm_handle_ = infer_request_shm_.handle_; + requested_output_names_shm_ = std::move(requested_output_names_shm); +} + +std::unique_ptr +InferRequest::LoadFromSharedMemory( + std::unique_ptr& shm_pool, + bi::managed_external_buffer::handle_t request_handle, bool open_cuda_handle, + bool const* is_model_decoupled) +{ + AllocatedSharedMemory infer_request_shm = + shm_pool->Load(request_handle); + InferRequestShm* infer_request_shm_ptr = + reinterpret_cast(infer_request_shm.data_.get()); + + std::vector> requested_output_names_shm; + uint32_t requested_output_count = + infer_request_shm_ptr->requested_output_count; + + bi::managed_external_buffer::handle_t* output_names_handle_shm_ptr = + reinterpret_cast( + (reinterpret_cast(infer_request_shm_ptr) + + sizeof(InferRequestShm))); + + for (size_t output_idx = 0; output_idx < requested_output_count; + ++output_idx) { + std::unique_ptr pb_string = PbString::LoadFromSharedMemory( + shm_pool, output_names_handle_shm_ptr[output_idx]); + requested_output_names_shm.emplace_back(std::move(pb_string)); + } + + bi::managed_external_buffer::handle_t* input_names_handle_shm_ptr = + reinterpret_cast( + (reinterpret_cast(infer_request_shm_ptr) + + sizeof(InferRequestShm) + + (infer_request_shm_ptr->requested_output_count * + sizeof(bi::managed_external_buffer::handle_t)))); + + std::vector> input_tensors; + for (size_t input_idx = 0; input_idx < infer_request_shm_ptr->input_count; + ++input_idx) { + std::shared_ptr input_tensor = PbTensor::LoadFromSharedMemory( + shm_pool, input_names_handle_shm_ptr[input_idx], open_cuda_handle); + input_tensors.emplace_back(std::move(input_tensor)); + } + + std::unique_ptr correlation_id_shm = + CorrelationId::LoadFromSharedMemory( + shm_pool, infer_request_shm_ptr->correlation_id_shm_handle); + + std::unique_ptr infer_trace_shm = + InferenceTrace::LoadFromSharedMemory( + shm_pool, infer_request_shm_ptr->trace_shm_handle); + + std::unique_ptr model_name_shm = PbString::LoadFromSharedMemory( + shm_pool, infer_request_shm_ptr->model_name_shm_handle); + std::unique_ptr request_id_shm = PbString::LoadFromSharedMemory( + shm_pool, infer_request_shm_ptr->request_id_shm_handle); + std::unique_ptr parameters_shm = PbString::LoadFromSharedMemory( + shm_pool, infer_request_shm_ptr->parameters_shm_handle); + + return std::unique_ptr(new InferRequest( + infer_request_shm, request_id_shm, correlation_id_shm, + requested_output_names_shm, model_name_shm, input_tensors, parameters_shm, + infer_trace_shm, is_model_decoupled)); +} + +InferRequest::InferRequest( + AllocatedSharedMemory& infer_request_shm, + std::unique_ptr& request_id_shm, + std::unique_ptr& correlation_id_shm, + std::vector>& requested_output_names_shm, + std::unique_ptr& model_name_shm, + std::vector>& input_tensors, + std::unique_ptr& parameters_shm, + std::unique_ptr& infer_trace_shm, + bool const* is_model_decoupled) + : infer_request_shm_(std::move(infer_request_shm)), + request_id_shm_(std::move(request_id_shm)), + requested_output_names_shm_(std::move(requested_output_names_shm)), + model_name_shm_(std::move(model_name_shm)), + parameters_shm_(std::move(parameters_shm)) +{ + infer_request_shm_ptr_ = + reinterpret_cast(infer_request_shm_.data_.get()); + output_names_handle_shm_ptr_ = + reinterpret_cast( + reinterpret_cast(infer_request_shm_ptr_) + + sizeof(InferRequestShm)); + 
input_tensors_handle_ptr_ = + reinterpret_cast( + reinterpret_cast(infer_request_shm_ptr_) + + sizeof(InferRequestShm) + + sizeof(bi::managed_external_buffer::handle_t) * + infer_request_shm_ptr_->requested_output_count); + inputs_ = std::move(input_tensors); + + std::set requested_output_names; + for (size_t output_idx = 0; + output_idx < infer_request_shm_ptr_->requested_output_count; + ++output_idx) { + auto& pb_string = requested_output_names_shm_[output_idx]; + requested_output_names.emplace(pb_string->String()); + } + + correlation_id_ = CorrelationId(correlation_id_shm); + request_id_ = request_id_shm_->String(); + parameters_ = parameters_shm_->String(); + requested_output_names_ = std::move(requested_output_names); + model_name_ = model_name_shm_->String(); + flags_ = infer_request_shm_ptr_->flags; + model_version_ = infer_request_shm_ptr_->model_version; + request_address_ = infer_request_shm_ptr_->address; + response_factory_address_ = infer_request_shm_ptr_->response_factory_address; + is_decoupled_ = infer_request_shm_ptr_->is_decoupled; + timeout_ = infer_request_shm_ptr_->timeout; + preferred_memory_ = infer_request_shm_ptr_->preferred_memory; + trace_ = InferenceTrace(infer_trace_shm); + request_release_flags_ = infer_request_shm_ptr_->request_release_flags; + +#ifdef TRITON_PB_STUB + pb_cancel_ = + std::make_shared(response_factory_address_, request_address_); + response_sender_ = std::make_shared( + request_address_, response_factory_address_, is_model_decoupled, + RequestedOutputNames(), Stub::GetOrCreateInstance()->SharedMemory(), + pb_cancel_); +#endif +} + +#ifdef TRITON_PB_STUB +bool +InferRequest::IsCancelled() +{ + return pb_cancel_->IsCancelled(); +} + +std::shared_ptr +InferRequest::GetResponseSender() +{ + return response_sender_; +} + +std::shared_ptr +InferRequest::Exec(const bool is_decoupled) +{ + // Release the GIL. This avoids a potential deadlock situation in the parent + // process, where every thread in the thread pool is indirectly waiting for a + // function in the stub process that acquires the GIL. Meanwhile, the current + // thread, which holds the GIL, is also waiting for the parent side to have + // the next available thread to pick up the job during resource contention. + py::gil_scoped_release release; + + // BLS should not be used in "initialize" or "finalize" function. 
+ std::unique_ptr& stub = Stub::GetOrCreateInstance(); + if (!stub->IsInitialized() || stub->IsFinalizing()) { + throw PythonBackendException( + "BLS is only supported during the 'execute' function."); + } + + ResponseBatch* response_batch = nullptr; + bool responses_is_set = false; + std::unique_ptr& shm_pool = stub->SharedMemory(); + bi::managed_external_buffer::handle_t* response_handle = nullptr; + + PythonBackendException pb_exception(std::string{}); + std::unique_ptr ipc_message; + + AllocatedSharedMemory request_batch; + ScopedDefer data_load_complete([&ipc_message] { + bi::scoped_lock lock{ + *(ipc_message->ResponseMutex())}; + ipc_message->ResponseCondition()->notify_all(); + }); + + try { + ipc_message = IPCMessage::Create(shm_pool, true /* inline_response */); + bool has_exception = false; + PythonBackendException pb_exception(std::string{}); + + if (is_decoupled) { + ipc_message->Command() = + PYTHONSTUB_CommandType::PYTHONSTUB_InferStreamExecRequest; + } else { + ipc_message->Command() = + PYTHONSTUB_CommandType::PYTHONSTUB_InferExecRequest; + } + + request_batch = shm_pool->Construct( + sizeof(RequestBatch) + sizeof(bi::managed_external_buffer::handle_t)); + + RequestBatch* request_batch_shm_ptr = + reinterpret_cast(request_batch.data_.get()); + request_batch_shm_ptr->batch_size = 1; + ipc_message->Args() = request_batch.handle_; + + bi::managed_external_buffer::handle_t* requests_shm = + reinterpret_cast( + request_batch.data_.get() + sizeof(RequestBatch)); + request_batch_shm_ptr->batch_size = 1; + + bool has_gpu_tensor = false; + size_t i = 0; + for (auto& input_tensor : inputs_) { + input_tensor->SaveToSharedMemory(shm_pool, false /* copy_gpu */); + if (!input_tensor->IsCPU()) { + has_gpu_tensor = true; + } + ++i; + } + + SaveToSharedMemory(shm_pool); + + // Save the shared memory offset of the request. + *requests_shm = ShmHandle(); + + // Send the BLS request to the parent process and wait for the response. + { + bi::scoped_lock lock{ + *(ipc_message->ResponseMutex())}; + stub->SendIPCUtilsMessage(ipc_message); + ipc_message->ResponseCondition()->wait(lock); + } + + // Additional round trip required for asking the stub process + // to fill in the GPU tensor buffers + if (has_gpu_tensor) { + AllocatedSharedMemory gpu_buffers_shm = + shm_pool->Load( + request_batch_shm_ptr->gpu_buffers_handle); + AllocatedSharedMemory + gpu_buffers_handle = + shm_pool->Load( + gpu_buffers_shm.data_->buffers); + try { + if (!gpu_buffers_shm.data_->success) { + std::unique_ptr error = PbString::LoadFromSharedMemory( + shm_pool, gpu_buffers_shm.data_->error); + throw PythonBackendException(error->String()); + } +#ifdef TRITON_ENABLE_GPU + size_t i = 0; + for (auto& input_tensor : this->Inputs()) { + if (!input_tensor->IsCPU()) { + std::unique_ptr dst_buffer = + PbMemory::LoadFromSharedMemory( + shm_pool, (gpu_buffers_handle.data_.get())[i], + true /* open cuda handle */); + PbMemory::CopyBuffer(dst_buffer, input_tensor->Memory()); + ++i; + } + } +#endif // TRITON_ENABLE_GPU + } + catch (const PythonBackendException& exception) { + // We need to catch the exception here. Otherwise, we will not notify + // the main process and it will wait for the response forever. + pb_exception = exception; + has_exception = true; + } + + { + bi::scoped_lock lock{ + *(ipc_message->ResponseMutex())}; + ipc_message->ResponseCondition()->notify_all(); + ipc_message->ResponseCondition()->wait(lock); + } + } + + // The exception will be thrown after the message was sent to the main + // process. 
+ if (has_exception) { + throw pb_exception; + } + + // Get the response for the current message. + std::unique_ptr bls_response = IPCMessage::LoadFromSharedMemory( + shm_pool, ipc_message->ResponseHandle()); + + AllocatedSharedMemory response_batch_shm = + shm_pool->Load(bls_response->Args()); + response_batch = + reinterpret_cast(response_batch_shm.data_.get()); + response_handle = reinterpret_cast( + response_batch_shm.data_.get() + sizeof(ResponseBatch)); + + responses_is_set = true; + if (response_batch->has_error) { + if (response_batch->is_error_set) { + std::unique_ptr pb_string = + PbString::LoadFromSharedMemory(shm_pool, response_batch->error); + auto error_response = std::make_unique( + std::vector>{}, + std::make_shared(pb_string->String())); + + return error_response; + } else { + auto error_response = std::make_unique( + std::vector>{}, + std::make_shared( + "An error occurred while performing BLS request.")); + + return error_response; + } + } + } + catch (const PythonBackendException& pb_exception) { + auto error_response = std::make_unique( + std::vector>{}, + std::make_shared(pb_exception.what())); + + return error_response; + } + + if (responses_is_set) { + auto& memory_manager_message_queue = stub->MemoryManagerQueue(); + std::unique_ptr return_response = + InferResponse::LoadFromSharedMemory( + shm_pool, *response_handle, true /* open cuda handle */); + + for (auto& output_tensor : return_response->OutputTensors()) { + if (!output_tensor->IsCPU()) { + uint64_t memory_release_id = output_tensor->Memory()->MemoryReleaseId(); + output_tensor->Memory()->SetMemoryReleaseCallback( + [&memory_manager_message_queue, memory_release_id, &shm_pool]() { + memory_manager_message_queue->Push(memory_release_id); + }); + } + } + + return return_response; + } else { + auto error_response = std::make_unique( + std::vector>{}, + std::make_shared( + "An error occurred while performing BLS request.")); + + return error_response; + } +} + +#endif + +}}} // namespace triton::backend::python diff --git a/src/infer_request.h b/src/infer_request.h new file mode 100644 index 00000000..f368d692 --- /dev/null +++ b/src/infer_request.h @@ -0,0 +1,175 @@ +// Copyright 2021-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions +// are met: +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// * Neither the name of NVIDIA CORPORATION nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +#pragma once + +#include +#include + +#include "correlation_id.h" +#include "infer_response.h" +#include "infer_trace.h" +#include "pb_preferred_memory.h" +#include "pb_tensor.h" + +#ifdef TRITON_PB_STUB +#include "pb_cancel.h" +#include "response_sender.h" +#endif + +namespace triton { namespace backend { namespace python { + +class Stub; + +// +// Inference Request +// +struct InferRequestShm { + uint32_t input_count; + uint32_t requested_output_count; + int64_t model_version; + uint32_t flags; + intptr_t address; + intptr_t response_factory_address; + bool is_decoupled; + uint64_t timeout; + PreferredMemory preferred_memory; + bi::managed_external_buffer::handle_t trace_shm_handle; + uint32_t request_release_flags; + bi::managed_external_buffer::handle_t correlation_id_shm_handle; + bi::managed_external_buffer::handle_t model_name_shm_handle; + bi::managed_external_buffer::handle_t request_id_shm_handle; + bi::managed_external_buffer::handle_t parameters_shm_handle; +}; + +class InferRequest { + public: + InferRequest( + const std::string& request_id, const CorrelationId& correlation_id, + const std::vector>& inputs, + const std::set& requested_output_names, + const std::string& model_name, const int64_t model_version, + const std::string& parameters, const uint32_t flags = 0, + const uint64_t timeout = 0, const intptr_t response_factory_address = 0, + const intptr_t request_address = 0, + const PreferredMemory& preferred_memory = + PreferredMemory(PreferredMemory::kDefault, 0), + const InferenceTrace& trace = InferenceTrace()); + + const std::vector>& Inputs(); + const std::string& RequestId(); + const std::string& Parameters(); + CorrelationId& GetCorrelationId(); + const std::string& ModelName(); + int64_t ModelVersion(); + uint32_t Flags(); + void SetFlags(uint32_t flags); + const std::set& RequestedOutputNames(); + bi::managed_external_buffer::handle_t ShmHandle(); + uint64_t Timeout(); + bool IsDecoupled(); + void SetIsDecoupled(const bool is_decoupled); + PreferredMemory& GetPreferredMemory(); + InferenceTrace& GetTrace(); + uint32_t ReleaseFlags(); + void SetReleaseFlags(const uint32_t& flags); + intptr_t GetResponseFactoryAddress() { return response_factory_address_; } + +#ifdef TRITON_PB_STUB + std::shared_ptr Exec(const bool is_decoupled); + std::shared_ptr GetResponseSender(); + bool IsCancelled(); +#endif + + /// Save an Inference Request to shared memory. + /// \param shm_pool Shared memory pool to save the inference request. + void SaveToSharedMemory(std::unique_ptr& shm_pool); + + /// Create an Inference Request object from shared memory. + /// \param shm_pool Shared memory pool + /// \param request_handle Shared memory handle of the request. + /// \param open_cuda_handle Determines if the tensor in the infer request + /// object is a GPU tensor, to call the cudaIpcOpenMemHandle to obtain the + /// tensor or not. + /// \return Returns the infer request in the specified request_handle + /// location. 
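+ /// A minimal usage sketch (the variable names are illustrative only):
+ ///   std::unique_ptr<InferRequest> request =
+ ///       InferRequest::LoadFromSharedMemory(
+ ///           shm_pool, request_handle, true /* open_cuda_handle */,
+ ///           &is_model_decoupled);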
+ static std::unique_ptr LoadFromSharedMemory( + std::unique_ptr& shm_pool, + bi::managed_external_buffer::handle_t request_handle, + bool open_cuda_handle, bool const* is_model_decoupled); + + /// Disallow copying the inference request object. + DISALLOW_COPY_AND_ASSIGN(InferRequest); + + intptr_t RequestAddress(); + ~InferRequest() {} + + private: + InferRequest( + AllocatedSharedMemory& infer_request_shm, + std::unique_ptr& request_id_shm, + std::unique_ptr& correlation_id, + std::vector>& requested_output_names_shm, + std::unique_ptr& model_name_shm, + std::vector>& input_tensors, + std::unique_ptr& parameters_shm, + std::unique_ptr& infer_trace_shm, + bool const* is_model_decoupled); + + std::string request_id_; + CorrelationId correlation_id_; + std::vector> inputs_; + std::set requested_output_names_; + std::string model_name_; + int64_t model_version_; + std::string parameters_; + uint32_t flags_; + uint64_t timeout_; + intptr_t response_factory_address_; + intptr_t request_address_; + bool is_decoupled_; + PreferredMemory preferred_memory_; + InferenceTrace trace_; + uint32_t request_release_flags_; + + // Shared Memory Data Structures + AllocatedSharedMemory infer_request_shm_; + InferRequestShm* infer_request_shm_ptr_; + + std::unique_ptr request_id_shm_; + std::vector> requested_output_names_shm_; + std::unique_ptr model_name_shm_; + bi::managed_external_buffer::handle_t* output_names_handle_shm_ptr_; + bi::managed_external_buffer::handle_t* input_tensors_handle_ptr_; + bi::managed_external_buffer::handle_t shm_handle_; + std::unique_ptr parameters_shm_; + +#ifdef TRITON_PB_STUB + std::shared_ptr pb_cancel_; + std::shared_ptr response_sender_; +#endif +}; +}}}; // namespace triton::backend::python diff --git a/src/infer_response.cc b/src/infer_response.cc new file mode 100644 index 00000000..382756d4 --- /dev/null +++ b/src/infer_response.cc @@ -0,0 +1,462 @@ +// Copyright 2021-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions +// are met: +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// * Neither the name of NVIDIA CORPORATION nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
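+ // Illustrative construction of a response (the 'output_tensors' vector is
+ // assumed to be populated elsewhere; error and parameters are optional and
+ // default to nullptr and an empty string):
+ //   auto response = std::make_unique<InferResponse>(
+ //       output_tensors, nullptr /* error */, "" /* parameters */);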
+ +#include "infer_response.h" + +#ifdef TRITON_PB_STUB +#include +namespace py = pybind11; +#endif +#include + +#include "scoped_defer.h" + + +namespace triton { namespace backend { namespace python { + +InferResponse::InferResponse( + const std::vector>& output_tensors, + std::shared_ptr error, std::string parameters, + const bool is_last_response, void* id) + : error_(error), is_last_response_(is_last_response), id_(id), + parameters_(std::move(parameters)) +{ + for (auto& output : output_tensors) { + if (!output) { + throw PythonBackendException( + "Output tensor for inference response should not be empty."); + } + } + + output_tensors_ = output_tensors; +} + +std::vector>& +InferResponse::OutputTensors() +{ + return output_tensors_; +} + +const std::string& +InferResponse::Parameters() const +{ + return parameters_; +} + +bool +InferResponse::HasError() +{ + return error_.get() != nullptr; +} + +void +InferResponse::SaveToSharedMemory( + std::unique_ptr& shm_pool, bool copy_gpu) +{ + size_t output_tensor_length = output_tensors_.size(); + if (HasError()) { + response_shm_ = shm_pool->Construct(sizeof(ResponseShm)); + } else { + response_shm_ = shm_pool->Construct( + sizeof(ResponseShm) + + output_tensor_length * sizeof(bi::managed_external_buffer::handle_t)); + } + + ResponseShm* response_shm_ptr = + reinterpret_cast(response_shm_.data_.get()); + response_shm_ptr->has_error = false; + response_shm_ptr->is_error_set = false; + shm_handle_ = response_shm_.handle_; + response_shm_ptr->is_last_response = is_last_response_; + response_shm_ptr->id = id_; + + // Only save the output tensors to shared memory when the inference response + // doesn't have error. + if (HasError()) { + response_shm_ptr->has_error = true; + Error()->SaveToSharedMemory(shm_pool); + + response_shm_ptr->is_error_set = true; + response_shm_ptr->error = Error()->ShmHandle(); + response_shm_ptr->outputs_size = 0; + } else { + bi::managed_external_buffer::handle_t* tensor_handle_shm_ptr = + reinterpret_cast( + response_shm_.data_.get() + sizeof(ResponseShm)); + response_shm_ptr->outputs_size = output_tensor_length; + + size_t j = 0; + for (auto& output_tensor : output_tensors_) { + output_tensor->SaveToSharedMemory(shm_pool, copy_gpu); + tensor_handle_shm_ptr[j] = output_tensor->ShmHandle(); + j++; + } + + parameters_shm_ = PbString::Create(shm_pool, parameters_); + response_shm_ptr->parameters = parameters_shm_->ShmHandle(); + } +} + +bi::managed_external_buffer::handle_t +InferResponse::ShmHandle() +{ + return shm_handle_; +} + +void +InferResponse::PruneOutputTensors( + const std::set& requested_output_names) +{ + for (auto it = output_tensors_.begin(); it != output_tensors_.end();) { + if (requested_output_names.find((*it)->Name()) == + requested_output_names.end()) { + it = output_tensors_.erase(it); + } else { + it++; + } + } +} + +std::unique_ptr +InferResponse::LoadFromSharedMemory( + std::unique_ptr& shm_pool, + bi::managed_external_buffer::handle_t response_handle, + bool open_cuda_handle) +{ + AllocatedSharedMemory response_shm = + shm_pool->Load(response_handle); + ResponseShm* response_shm_ptr = + reinterpret_cast(response_shm.data_.get()); + uint32_t requested_output_count = response_shm_ptr->outputs_size; + + std::shared_ptr pb_error; + std::vector> output_tensors; + std::shared_ptr parameters_shm; + std::string parameters; + + // If the error field is set, do not load output tensors from shared memory. 
+ if (response_shm_ptr->has_error && response_shm_ptr->is_error_set) { + pb_error = PbError::LoadFromSharedMemory(shm_pool, response_shm_ptr->error); + } else if (response_shm_ptr->has_error && !response_shm_ptr->is_error_set) { + pb_error = + std::make_shared("Failed to retrieve the response error."); + } else { + bi::managed_external_buffer::handle_t* tensor_handle_shm = + reinterpret_cast( + response_shm.data_.get() + sizeof(ResponseShm)); + { +#ifdef TRITON_PB_STUB + // PbTensor::LoadFromSharedMemory() will construct Python objects if + // called from pb_stub, which requires holding the GIL. + py::gil_scoped_acquire acquire; +#endif + for (size_t idx = 0; idx < requested_output_count; ++idx) { + std::shared_ptr pb_tensor = PbTensor::LoadFromSharedMemory( + shm_pool, tensor_handle_shm[idx], open_cuda_handle); + output_tensors.emplace_back(std::move(pb_tensor)); + } + } + + parameters_shm = std::move( + PbString::LoadFromSharedMemory(shm_pool, response_shm_ptr->parameters)); + parameters = parameters_shm->String(); + } + + return std::unique_ptr(new InferResponse( + response_shm, output_tensors, pb_error, + response_shm_ptr->is_last_response, response_shm_ptr->id, parameters_shm, + parameters)); +} + +InferResponse::InferResponse( + AllocatedSharedMemory& response_shm, + std::vector>& output_tensors, + std::shared_ptr& pb_error, const bool is_last_response, void* id, + std::shared_ptr& parameters_shm, std::string& parameters) +{ + response_shm_ = std::move(response_shm); + output_tensors_ = std::move(output_tensors); + error_ = std::move(pb_error); + shm_handle_ = response_shm_.handle_; + id_ = id; + is_last_response_ = is_last_response; + parameters_shm_ = std::move(parameters_shm); + parameters_ = std::move(parameters); +} + +std::shared_ptr& +InferResponse::Error() +{ + return error_; +} + +void* +InferResponse::Id() +{ + return id_; +} + +bool +InferResponse::IsLastResponse() +{ + return is_last_response_; +} + +#ifndef TRITON_PB_STUB +void +InferResponse::Send( + TRITONBACKEND_Response* response, void* cuda_stream, + bool& requires_deferred_callback, const uint32_t flags, + std::unique_ptr& shm_pool, + GPUBuffersHelper& gpu_buffer_helper, + std::vector, void*>>& output_buffers, + const std::set& requested_output_names) +{ +#ifdef TRITON_ENABLE_GPU + static bool log_warning = true; +#endif // TRITON_ENABLE_GPU + + std::shared_ptr response_error = + WrapTritonErrorInSharedPtr(nullptr); + std::unique_ptr response_error_handling; + requires_deferred_callback = false; + + // This lambda expression will be called when this function exits, if the + // inference response doesn't have any GPU tensors. Otherwise, it will be + // called when the object is destructed or DeferredSendCallback is called. + response_error_handling = + std::make_unique([response, response_error, flags] { + if (response != nullptr) { + LOG_IF_ERROR( + TRITONBACKEND_ResponseSend(response, flags, *response_error), + "failed to send the response."); + } + }); + + // Moves the response sending callback so that it is not called until the stub + // process fills in the GPU buffers. 
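+ // ScopedDefer runs the captured callable (at the latest) when the object
+ // goes out of scope; an illustrative standalone use would be:
+ //   ScopedDefer cleanup([] { /* runs when 'cleanup' is destroyed */ });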
+ ScopedDefer deferred_task([this, &requires_deferred_callback, + &response_error_handling, &gpu_buffer_helper, + response_error, &shm_pool] { + if (*response_error != nullptr) { + gpu_buffer_helper.SetError( + shm_pool, TRITONSERVER_ErrorMessage(*response_error)); + } + if (requires_deferred_callback) { + deferred_send_callback_ = std::move(response_error_handling); + } + }); + + if (HasError()) { + *response_error = + TRITONSERVER_ErrorNew(Error()->Code(), Error()->Message().c_str()); + return; + } + + bool cuda_copy = false; + + for (auto& output_tensor : OutputTensors()) { + // FIXME: for decoupled models we will skip the requested output names. + TRITONSERVER_MemoryType src_memory_type = output_tensor->MemoryType(); + int64_t src_memory_type_id = output_tensor->MemoryTypeId(); + + TRITONSERVER_MemoryType actual_memory_type = src_memory_type; + int64_t actual_memory_type_id = src_memory_type_id; + + if (actual_memory_type == TRITONSERVER_MEMORY_GPU) { + requires_deferred_callback = true; + } + + TRITONBACKEND_Output* response_output; + SET_ERROR_AND_RETURN( + response_error, + TRITONBACKEND_ResponseOutput( + response, &response_output, output_tensor->Name().c_str(), + static_cast(output_tensor->TritonDtype()), + output_tensor->Dims().data(), output_tensor->Dims().size())); + + void* triton_output_buffer; + SET_ERROR_AND_RETURN( + response_error, + TRITONBACKEND_OutputBuffer( + response_output, &triton_output_buffer, output_tensor->ByteSize(), + &actual_memory_type, &actual_memory_type_id)); + + bool cuda_used = false; + TRITONSERVER_BufferAttributes* output_buffer_attributes; + SET_ERROR_AND_RETURN( + response_error, TRITONBACKEND_OutputBufferAttributes( + response_output, &output_buffer_attributes)); + + std::unique_ptr output_buffer; + if (src_memory_type == TRITONSERVER_MEMORY_GPU && + actual_memory_type == TRITONSERVER_MEMORY_GPU) { +#ifdef TRITON_ENABLE_GPU + // Check if the triton-provided output buffer is using CUDA shared memory + // pool. If not, try to allocate a new buffer from the pool. 
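+ // Two checks are involved below: UseCudaSharedPool() reports whether the
+ // CUDA shared memory pool is configured for this device, and
+ // IsUsingCUDAPool() reports whether the buffer Triton returned already
+ // lives inside that pool.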
+ void* buffer = triton_output_buffer; + BackendMemory* backend_memory; + std::unique_ptr lbackend_memory; + std::unique_ptr& cuda_pool = + shm_pool->GetCUDAMemoryPoolManager(); + if (cuda_pool->UseCudaSharedPool(src_memory_type_id)) { + try { + if (!IsUsingCUDAPool( + cuda_pool, actual_memory_type_id, triton_output_buffer)) { + THROW_IF_TRITON_ERROR(BackendMemory::Create( + reinterpret_cast( + shm_pool->GetCUDAMemoryPoolManager() + ->TritonMemoryManager()), + BackendMemory::AllocationType::GPU_POOL, actual_memory_type_id, + output_tensor->ByteSize(), &backend_memory)); + lbackend_memory.reset(backend_memory); + buffer = lbackend_memory->MemoryPtr(); + } + } + catch (const PythonBackendException& pb_exception) { + if (log_warning) { + LOG_MESSAGE( + TRITONSERVER_LOG_WARN, + (std::string("Failed to allocate memory from CUDA memory pool " + "for output tensor: ") + + pb_exception.what() + + std::string(", will use CUDA IPC for GPU output transfer.")) + .c_str()); + } + log_warning = false; + } + } + cudaIpcMemHandle_t* cuda_ipc_mem_handle_p; + SET_ERROR_AND_RETURN( + response_error, + TRITONSERVER_BufferAttributesCudaIpcHandle( + output_buffer_attributes, + reinterpret_cast(&cuda_ipc_mem_handle_p))); + + if (cuda_ipc_mem_handle_p != nullptr) { + SET_ERROR_AND_RETURN_IF_EXCEPTION( + response_error, + output_buffer = PbMemory::Create( + shm_pool, actual_memory_type, actual_memory_type_id, + output_tensor->ByteSize(), reinterpret_cast(buffer), + false /* copy_gpu */)); + output_buffer->SetCudaIpcHandle(cuda_ipc_mem_handle_p); + } else { + SET_ERROR_AND_RETURN_IF_EXCEPTION( + response_error, + output_buffer = PbMemory::Create( + shm_pool, actual_memory_type, actual_memory_type_id, + output_tensor->ByteSize(), reinterpret_cast(buffer), + true /* copy_gpu */)); + } + + if (lbackend_memory != nullptr) { + output_buffer->SetBackendMemory(std::move(lbackend_memory)); + } + gpu_buffer_helper.AddBuffer(output_buffer->ShmHandle()); + output_buffers.push_back( + {std::move(output_buffer), triton_output_buffer}); +#endif + } + + // When we requested a GPU buffer but received a CPU buffer. 
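+ // In that case a placeholder PbMemory with no backing data pointer is
+ // registered (note the nullptr data ptr below) so the tensor contents can
+ // be filled in during the later GPU-buffer pass.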
+ if (src_memory_type == TRITONSERVER_MEMORY_GPU && + (actual_memory_type == TRITONSERVER_MEMORY_CPU || + actual_memory_type == TRITONSERVER_MEMORY_CPU_PINNED)) { + SET_ERROR_AND_RETURN_IF_EXCEPTION( + response_error, + output_buffer = PbMemory::Create( + shm_pool, actual_memory_type, actual_memory_type_id, + output_tensor->ByteSize(), nullptr /* data ptr */)); + + gpu_buffer_helper.AddBuffer(output_buffer->ShmHandle()); + output_buffers.push_back( + {std::move(output_buffer), triton_output_buffer}); + } + + if (src_memory_type != TRITONSERVER_MEMORY_GPU) { + SET_ERROR_AND_RETURN( + response_error, + CopyBuffer( + "Failed to copy the output tensor to buffer.", src_memory_type, + src_memory_type_id, actual_memory_type, actual_memory_type_id, + output_tensor->ByteSize(), output_tensor->DataPtr(), + triton_output_buffer, reinterpret_cast(cuda_stream), + &cuda_used)); + } + + cuda_copy |= cuda_used; + } + + if (!parameters_.empty()) { + triton::common::TritonJson::Value param; + THROW_IF_TRITON_ERROR( + param.Parse(parameters_.c_str(), parameters_.length())); + std::vector param_keys; + THROW_IF_TRITON_ERROR(param.Members(¶m_keys)); + for (const auto& key : param_keys) { + triton::common::TritonJson::Value value; + if (!param.Find(key.c_str(), &value)) { + throw PythonBackendException("Unexpected missing key on parameters"); + } + if (value.IsString()) { + std::string string_value; + THROW_IF_TRITON_ERROR(value.AsString(&string_value)); + THROW_IF_TRITON_ERROR(TRITONBACKEND_ResponseSetStringParameter( + response, key.c_str(), string_value.c_str())); + } else if (value.IsInt()) { + int64_t int_value = 0; + THROW_IF_TRITON_ERROR(value.AsInt(&int_value)); + THROW_IF_TRITON_ERROR(TRITONBACKEND_ResponseSetIntParameter( + response, key.c_str(), int_value)); + } else if (value.IsBool()) { + bool bool_value = false; + THROW_IF_TRITON_ERROR(value.AsBool(&bool_value)); + THROW_IF_TRITON_ERROR(TRITONBACKEND_ResponseSetBoolParameter( + response, key.c_str(), bool_value)); + } else { + throw PythonBackendException("Unsupported value type on parameters"); + } + } + } + +#ifdef TRITON_ENABLE_GPU + if (cuda_copy) { + cudaStreamSynchronize(reinterpret_cast(cuda_stream)); + } +#endif // TRITON_ENABLE_GPU +} +#endif + +#ifndef TRITON_PB_STUB +void +InferResponse::DeferredSendCallback() +{ + deferred_send_callback_.reset(); +} +#endif + +}}} // namespace triton::backend::python diff --git a/src/infer_response.h b/src/infer_response.h new file mode 100644 index 00000000..ab8eb68a --- /dev/null +++ b/src/infer_response.h @@ -0,0 +1,138 @@ +// Copyright 2021-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions +// are met: +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// * Neither the name of NVIDIA CORPORATION nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. 
+// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +#pragma once + +#include + +#include "gpu_buffers.h" +#include "pb_error.h" +#include "pb_tensor.h" +#include "pb_utils.h" +#include "scoped_defer.h" + +namespace triton { namespace backend { namespace python { + +struct ResponseShm { + uint32_t outputs_size; + bi::managed_external_buffer::handle_t parameters; + bi::managed_external_buffer::handle_t error; + bool has_error; + // Indicates whether this error has a message or not. + bool is_error_set; + void* id; + bool is_last_response; +}; + +#define SET_ERROR_AND_RETURN(E, X) \ + do { \ + TRITONSERVER_Error* raasnie_err__ = (X); \ + if (raasnie_err__ != nullptr) { \ + *E = raasnie_err__; \ + return; \ + } \ + } while (false) + +#define SET_ERROR_AND_RETURN_IF_EXCEPTION(E, X) \ + do { \ + try { \ + (X); \ + } \ + catch (const PythonBackendException& pb_exception) { \ + TRITONSERVER_Error* rarie_err__ = TRITONSERVER_ErrorNew( \ + TRITONSERVER_ERROR_INTERNAL, pb_exception.what()); \ + *E = rarie_err__; \ + return; \ + } \ + } while (false) + +class InferResponse { + public: + InferResponse( + const std::vector>& output_tensors, + std::shared_ptr error = nullptr, std::string parameters = "", + const bool is_last_response = true, void* id = nullptr); + std::vector>& OutputTensors(); + const std::string& Parameters() const; // JSON serializable unless empty + void SaveToSharedMemory( + std::unique_ptr& shm_pool, bool copy_gpu = true); + static std::unique_ptr LoadFromSharedMemory( + std::unique_ptr& shm_pool, + bi::managed_external_buffer::handle_t response_handle, + bool open_cuda_handle); + bool HasError(); + std::shared_ptr& Error(); + bi::managed_external_buffer::handle_t ShmHandle(); + void PruneOutputTensors(const std::set& requested_output_names); + std::unique_ptr>> + GetNextResponse(); + void SetNextResponseHandle( + bi::managed_external_buffer::handle_t next_response_handle); + bi::managed_external_buffer::handle_t NextResponseHandle(); + void* Id(); + bool IsLastResponse(); + +#ifndef TRITON_PB_STUB + /// Send an inference response. If the response has a GPU tensor, sending the + /// response needs to be done in two step. The boolean + /// 'requires_deferred_callback' indicates whether DeferredSendCallback method + /// should be called or not. + void Send( + TRITONBACKEND_Response* response, void* cuda_stream, + bool& requires_deferred_callback, const uint32_t flags, + std::unique_ptr& shm_pool, + GPUBuffersHelper& gpu_buffer_helper, + std::vector, void*>>& output_buffers, + const std::set& requested_output_names = {}); + + void DeferredSendCallback(); +#endif + + // Disallow copying the inference response object. 
+ DISALLOW_COPY_AND_ASSIGN(InferResponse); + + private: + InferResponse( + AllocatedSharedMemory& response_shm, + std::vector>& output_tensors, + std::shared_ptr& pb_error, const bool is_last_response, void* id, + std::shared_ptr& parameters_shm, std::string& parameters); + std::vector> output_tensors_; + + std::shared_ptr error_; + bi::managed_external_buffer::handle_t shm_handle_; + AllocatedSharedMemory response_shm_; + std::vector, void*>> gpu_output_buffers_; + std::unique_ptr deferred_send_callback_; + bool is_last_response_; + // Representing the request id that the response was created from. + void* id_; + + std::shared_ptr parameters_shm_; + std::string parameters_; +}; + +}}} // namespace triton::backend::python diff --git a/src/infer_trace.cc b/src/infer_trace.cc new file mode 100644 index 00000000..50645dcc --- /dev/null +++ b/src/infer_trace.cc @@ -0,0 +1,101 @@ +// Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions +// are met: +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// * Neither the name of NVIDIA CORPORATION nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +#include "infer_trace.h" + +namespace triton { namespace backend { namespace python { + +InferenceTrace::InferenceTrace(const InferenceTrace& rhs) +{ + triton_trace_ = rhs.triton_trace_; + trace_context_ = rhs.trace_context_; +} + +InferenceTrace& +InferenceTrace::operator=(const InferenceTrace& rhs) +{ + triton_trace_ = rhs.triton_trace_; + trace_context_ = rhs.trace_context_; + return *this; +} + +InferenceTrace::InferenceTrace(std::unique_ptr& trace_shm) +{ + triton_trace_ = trace_shm->triton_trace_; + trace_context_ = trace_shm->trace_context_; +} + +void +InferenceTrace::SaveToSharedMemory( + std::unique_ptr& shm_pool) +{ + AllocatedSharedMemory infer_trace_shm = + shm_pool->Construct(); + infer_trace_shm_ptr_ = infer_trace_shm.data_.get(); + + infer_trace_shm_ptr_->triton_trace = triton_trace_; + + std::unique_ptr trace_context_shm = + PbString::Create(shm_pool, trace_context_); + + infer_trace_shm_ptr_->trace_context_shm_handle = + trace_context_shm->ShmHandle(); + + // Save the references to shared memory. 
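+ // Moving them into members keeps the PbString and the allocated block
+ // owned for the lifetime of this InferenceTrace, so the handles stored
+ // above stay valid.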
+ trace_context_shm_ = std::move(trace_context_shm); + infer_trace_shm_ = std::move(infer_trace_shm); + shm_handle_ = infer_trace_shm_.handle_; +} + +std::unique_ptr +InferenceTrace::LoadFromSharedMemory( + std::unique_ptr& shm_pool, + bi::managed_external_buffer::handle_t handle) +{ + AllocatedSharedMemory infer_trace_shm = + shm_pool->Load(handle); + InferenceTraceShm* infer_trace_shm_ptr = infer_trace_shm.data_.get(); + + std::unique_ptr trace_context_shm = PbString::LoadFromSharedMemory( + shm_pool, infer_trace_shm_ptr->trace_context_shm_handle); + + return std::unique_ptr( + new InferenceTrace(infer_trace_shm, trace_context_shm)); +} + +InferenceTrace::InferenceTrace( + AllocatedSharedMemory& infer_trace_shm, + std::unique_ptr& trace_context_shm) + : infer_trace_shm_(std::move(infer_trace_shm)), + trace_context_shm_(std::move(trace_context_shm)) +{ + infer_trace_shm_ptr_ = infer_trace_shm_.data_.get(); + shm_handle_ = infer_trace_shm_.handle_; + triton_trace_ = infer_trace_shm_ptr_->triton_trace; + trace_context_ = trace_context_shm_->String(); +} + +}}}; // namespace triton::backend::python diff --git a/src/infer_trace.h b/src/infer_trace.h new file mode 100644 index 00000000..aac9137f --- /dev/null +++ b/src/infer_trace.h @@ -0,0 +1,90 @@ +// Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions +// are met: +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// * Neither the name of NVIDIA CORPORATION nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +#pragma once + +#include + +#include "pb_string.h" +#include "pb_utils.h" + +namespace triton { namespace backend { namespace python { + +struct InferenceTraceShm { + bi::managed_external_buffer::handle_t trace_context_shm_handle; + // The address of the 'TRITONSERVER_InferTrace' object. 
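+ // Stored as a plain pointer value so it can round-trip through shared
+ // memory unchanged; it is only meaningful in the parent process.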
+ void* triton_trace; +}; + +// +// Inference Trace +// +class InferenceTrace { + public: + InferenceTrace(void* triton_trace, const std::string& ctxt) + : triton_trace_(triton_trace), trace_context_(ctxt) + { + } + InferenceTrace() : triton_trace_(nullptr), trace_context_("") {} + InferenceTrace(const InferenceTrace& rhs); + InferenceTrace(std::unique_ptr& trace_shm); + InferenceTrace& operator=(const InferenceTrace& rhs); + /// Save InferenceTrace object to shared memory. + /// \param shm_pool Shared memory pool to save the InferenceTrace object. + void SaveToSharedMemory(std::unique_ptr& shm_pool); + + /// Create a InferenceTrace object from shared memory. + /// \param shm_pool Shared memory pool + /// \param handle Shared memory handle of the InferenceTrace. + /// \return Returns the InferenceTrace in the specified handle + /// location. + static std::unique_ptr LoadFromSharedMemory( + std::unique_ptr& shm_pool, + bi::managed_external_buffer::handle_t handle); + + void* TritonTrace() { return triton_trace_; } + const std::string& Context() const { return trace_context_; } + + bi::managed_external_buffer::handle_t ShmHandle() { return shm_handle_; } + + private: + // The private constructor for creating a InferenceTrace object from shared + // memory. + InferenceTrace( + AllocatedSharedMemory& infer_trace_shm, + std::unique_ptr& trace_context_shm); + + void* triton_trace_; + std::string trace_context_; + + // Shared Memory Data Structures + AllocatedSharedMemory infer_trace_shm_; + InferenceTraceShm* infer_trace_shm_ptr_; + bi::managed_external_buffer::handle_t shm_handle_; + std::unique_ptr trace_context_shm_; +}; + +}}}; // namespace triton::backend::python diff --git a/src/ipc_message.cc b/src/ipc_message.cc new file mode 100644 index 00000000..2fa13ba3 --- /dev/null +++ b/src/ipc_message.cc @@ -0,0 +1,159 @@ +// Copyright 2021-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions +// are met: +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// * Neither the name of NVIDIA CORPORATION nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
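+ // A typical inline-response exchange, sketched with illustrative variable
+ // names (the peer-side send/receive plumbing lives elsewhere):
+ //   auto msg = IPCMessage::Create(shm_pool, true /* inline_response */);
+ //   msg->Command() = PYTHONSTUB_InferExecRequest;
+ //   bi::scoped_lock<bi::interprocess_mutex> lock{*(msg->ResponseMutex())};
+ //   ... push msg->ShmHandle() to the peer's message queue ...
+ //   msg->ResponseCondition()->wait(lock);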
+ +#include "ipc_message.h" + +#include + +namespace triton { namespace backend { namespace python { +std::unique_ptr +IPCMessage::Create( + const std::unique_ptr& shm_pool, bool inline_response) +{ + AllocatedSharedMemory ipc_message_shm = + shm_pool->Construct(); + + ipc_message_shm.data_->inline_response = inline_response; + AllocatedSharedMemory response_mutex_shm; + AllocatedSharedMemory response_cond_shm; + if (inline_response) { + response_mutex_shm = std::move(shm_pool->Construct( + 1 /* count */, true /* aligned */)); + response_cond_shm = + std::move(shm_pool->Construct( + 1 /* count */, true /* aligned */)); + + ipc_message_shm.data_->response_mutex = response_mutex_shm.handle_; + ipc_message_shm.data_->response_cond = response_cond_shm.handle_; + new (response_mutex_shm.data_.get()) bi::interprocess_mutex{}; + new (response_cond_shm.data_.get()) bi::interprocess_condition{}; + } + + return std::unique_ptr( + new IPCMessage(ipc_message_shm, response_mutex_shm, response_cond_shm)); +} + +std::unique_ptr +IPCMessage::Create( + IPCMessageShm* ipc_message_shm, + bi::managed_external_buffer::handle_t& message_handle) +{ + return std::unique_ptr( + new IPCMessage(ipc_message_shm, message_handle)); +} + +AllocatedSharedMemory& +IPCMessage::GetAllocatedSharedMemory() +{ + return ipc_message_shm_; +} + +std::unique_ptr +IPCMessage::LoadFromSharedMemory( + std::unique_ptr& shm_pool, + bi::managed_external_buffer::handle_t message_handle) +{ + AllocatedSharedMemory ipc_message_shm = + shm_pool->Load(message_handle); + + AllocatedSharedMemory response_mutex_shm; + AllocatedSharedMemory response_cond_shm; + if (ipc_message_shm.data_->inline_response) { + response_mutex_shm = shm_pool->Load( + ipc_message_shm.data_->response_mutex); + response_cond_shm = shm_pool->Load( + ipc_message_shm.data_->response_cond); + } + + return std::unique_ptr( + new IPCMessage(ipc_message_shm, response_mutex_shm, response_cond_shm)); +} + +PYTHONSTUB_CommandType& +IPCMessage::Command() +{ + return ipc_message_shm_ptr_->command; +} + +bi::managed_external_buffer::handle_t& +IPCMessage::Args() +{ + return ipc_message_shm_ptr_->args; +} + +bool& +IPCMessage::InlineResponse() +{ + return ipc_message_shm_ptr_->inline_response; +} + +bi::interprocess_condition* +IPCMessage::ResponseCondition() +{ + return response_cond_shm_ptr_; +} + +bi::interprocess_mutex* +IPCMessage::ResponseMutex() +{ + return response_mutex_shm_ptr_; +} + +bi::managed_external_buffer::handle_t& +IPCMessage::ResponseHandle() +{ + return ipc_message_shm_ptr_->response_handle; +} + +bi::managed_external_buffer::handle_t +IPCMessage::ShmHandle() +{ + return ipc_message_handle_; +} + +IPCMessage::IPCMessage( + AllocatedSharedMemory& ipc_message_shm, + AllocatedSharedMemory& response_mutex_shm, + AllocatedSharedMemory& response_cond_shm) + : ipc_message_shm_(std::move(ipc_message_shm)), + response_mutex_shm_(std::move(response_mutex_shm)), + response_cond_shm_(std::move(response_cond_shm)) +{ + ipc_message_shm_ptr_ = ipc_message_shm_.data_.get(); + response_mutex_shm_ptr_ = response_mutex_shm_.data_.get(); + response_cond_shm_ptr_ = response_cond_shm_.data_.get(); + ipc_message_handle_ = ipc_message_shm_.handle_; +} + +IPCMessage::IPCMessage( + IPCMessageShm* ipc_message_shm, + bi::managed_external_buffer::handle_t& handle) +{ + ipc_message_handle_ = handle; + ipc_message_shm_ptr_ = ipc_message_shm; +} + +}}}; // namespace triton::backend::python diff --git a/src/ipc_message.h b/src/ipc_message.h new file mode 100644 index 00000000..c0fab3a3 --- 
/dev/null +++ b/src/ipc_message.h @@ -0,0 +1,144 @@ +// Copyright 2021-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions +// are met: +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// * Neither the name of NVIDIA CORPORATION nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +#pragma once + +#include +#include + +#include "shm_manager.h" + + +namespace triton { namespace backend { namespace python { + +namespace bi = boost::interprocess; + +typedef enum PYTHONSTUB_commandtype_enum { + PYTHONSTUB_ExecuteRequest, + PYTHONSTUB_ExecuteResponse, + PYTHONSTUB_InitializeRequest, + PYTHONSTUB_InitializeResponse, + PYTHONSTUB_CUDAPoolInitializeRequest, + PYTHONSTUB_FinalizeRequest, + PYTHONSTUB_FinalizeResponse, + PYTHONSTUB_LoadGPUBuffers, + PYTHONSTUB_InferExecRequest, + PYTHONSTUB_InferStreamExecRequest, + PYTHONSTUB_InferExecResponse, + PYTHONSTUB_InferStreamExecResponse, + PYTHONSTUB_ResponseSend, + PYTHONSTUB_ResponseClose, + PYTHONSTUB_AutoCompleteRequest, + PYTHONSTUB_AutoCompleteResponse, + PYTHONSTUB_LogRequest, + PYTHONSTUB_BLSDecoupledInferPayloadCleanup, + PYTHONSTUB_DecoupledResponseFactoryCleanup, + PYTHONSTUB_MetricFamilyRequestNew, + PYTHONSTUB_MetricFamilyRequestDelete, + PYTHONSTUB_MetricRequestNew, + PYTHONSTUB_MetricRequestDelete, + PYTHONSTUB_MetricRequestValue, + PYTHONSTUB_MetricRequestIncrement, + PYTHONSTUB_MetricRequestSet, + PYTHONSTUB_MetricRequestObserve, + PYTHONSTUB_LoadModelRequest, + PYTHONSTUB_UnloadModelRequest, + PYTHONSTUB_ModelReadinessRequest, + PYTHONSTUB_IsRequestCancelled, + PYTHONSTUB_CancelBLSInferRequest +} PYTHONSTUB_CommandType; + +/// +/// Shared memory representation of IPCMessage +/// +/// \param command determines the IPC command that is going to be passed. +/// \param args determines the shared memory handle for the input parameters. +/// \param inline_response determines whether this is a response of another IPC +/// message. If this parameter is set, it must provide the handle of the +/// corresponding request in \param response_handle. +/// \param response_handle determines the request handle. +/// \param response_mutex stores the handle for the mutex for the response +/// object. 
+/// \param response_cond stores the handle for the condition variable +/// for the response object. +struct IPCMessageShm { + PYTHONSTUB_CommandType command; + bi::managed_external_buffer::handle_t args; + bool inline_response = false; + bi::managed_external_buffer::handle_t response_handle; + bi::managed_external_buffer::handle_t response_mutex; + bi::managed_external_buffer::handle_t response_cond; +}; + +class IPCMessage { + public: + static std::unique_ptr Create( + const std::unique_ptr& shm_pool, + bool inline_response); + + static std::unique_ptr Create( + IPCMessageShm* ipc_message_shm, + bi::managed_external_buffer::handle_t& message_handle); + static std::unique_ptr LoadFromSharedMemory( + std::unique_ptr& shm_pool, + bi::managed_external_buffer::handle_t message_handle); + + PYTHONSTUB_CommandType& Command(); + bool& InlineResponse(); + bi::managed_external_buffer::handle_t& ResponseHandle(); + bi::interprocess_condition* ResponseCondition(); + bi::interprocess_mutex* ResponseMutex(); + bi::managed_external_buffer::handle_t& Args(); + bi::managed_external_buffer::handle_t ShmHandle(); + AllocatedSharedMemory& GetAllocatedSharedMemory(); + + private: + AllocatedSharedMemory ipc_message_shm_; + IPCMessageShm* ipc_message_shm_ptr_; + + AllocatedSharedMemory response_mutex_shm_; + bi::interprocess_mutex* response_mutex_shm_ptr_; + + AllocatedSharedMemory response_cond_shm_; + bi::interprocess_condition* response_cond_shm_ptr_; + + bi::managed_external_buffer::handle_t ipc_message_handle_; + + /// Create/load a IPCMessage shm object. + /// \param ipc_message_shm IPCMessage representation in shared memory. + /// \param response_mutex_shm response mutex. + /// \param response_condition_shm response condition. + IPCMessage( + AllocatedSharedMemory& ipc_message_shm, + AllocatedSharedMemory& response_mutex_shm, + AllocatedSharedMemory& response_cond_shm); + + IPCMessage( + IPCMessageShm* ipc_message_shm, + bi::managed_external_buffer::handle_t& handle); +}; + +}}}; // namespace triton::backend::python diff --git a/src/memory_manager.cc b/src/memory_manager.cc new file mode 100644 index 00000000..716dee9e --- /dev/null +++ b/src/memory_manager.cc @@ -0,0 +1,112 @@ +// Copyright 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions +// are met: +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// * Neither the name of NVIDIA CORPORATION nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +#include "memory_manager.h" + +#include "pb_utils.h" + + +namespace triton { namespace backend { namespace python { + + +#ifdef TRITON_ENABLE_GPU +BackendMemoryRecord::BackendMemoryRecord( + std::unique_ptr backend_memory) + : backend_memory_(std::move(backend_memory)) +{ + release_callback_ = [](void* ptr) { + // Do nothing. The backend_memory_ will be destroyed in the destructor. + }; +} + +void* +BackendMemoryRecord::MemoryId() +{ + return reinterpret_cast(backend_memory_->MemoryPtr()); +} + +const std::function& +BackendMemoryRecord::ReleaseCallback() +{ + return release_callback_; +} +#endif + +MemoryManager::MemoryManager( + std::unique_ptr>&& memory_message_queue) +{ + message_queue_ = std::move(memory_message_queue); + thread_ = std::thread(&MemoryManager::QueueMonitorThread, this); +} + +intptr_t +MemoryManager::AddRecord(std::unique_ptr&& memory_record) +{ + std::lock_guard lock{mu_}; + + intptr_t memory_record_id = + reinterpret_cast(memory_record->MemoryId()); + records_.emplace(memory_record_id, std::move(memory_record)); + + return memory_record_id; +} + +void +MemoryManager::QueueMonitorThread() +{ + while (true) { + intptr_t memory = message_queue_->Pop(); + if (memory == 0) { + return; + } + + { + std::lock_guard lock{mu_}; + auto it = records_.find(memory); + if (it == records_.end()) { + LOG_MESSAGE( + TRITONSERVER_LOG_ERROR, + "Unexpected memory index received for deallocation."); + continue; + } + + // Call the release callback. + it->second->ReleaseCallback()(it->second->MemoryId()); + // it->second.reset(); + records_.erase(it); + } + } +} + +MemoryManager::~MemoryManager() +{ + // Push a dummy message that will trigger the destruction of the background + // thread. + message_queue_->Push(DUMMY_MESSAGE); + thread_.join(); +} + +}}}; // namespace triton::backend::python diff --git a/src/memory_manager.h b/src/memory_manager.h new file mode 100644 index 00000000..5b7e35f5 --- /dev/null +++ b/src/memory_manager.h @@ -0,0 +1,86 @@ +// Copyright 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions +// are met: +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// * Neither the name of NVIDIA CORPORATION nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. 
+// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +#pragma once + +#include +#include +#include +#include + +#include "message_queue.h" +#include "triton/backend/backend_common.h" +#include "triton/backend/backend_memory.h" +#include "triton/core/tritonserver.h" + +#ifdef TRITON_ENABLE_GPU +#include +#endif // TRITON_ENABLE_GPU + + +namespace triton { namespace backend { namespace python { + +class MemoryRecord { + public: + virtual const std::function& ReleaseCallback() = 0; + virtual void* MemoryId() = 0; + virtual ~MemoryRecord() = default; +}; + +#ifdef TRITON_ENABLE_GPU +class BackendMemoryRecord : public MemoryRecord { + public: + BackendMemoryRecord(std::unique_ptr backend_memory); + const std::function& ReleaseCallback() override; + void* MemoryId() override; + ~BackendMemoryRecord() { backend_memory_.reset(); } + + private: + std::unique_ptr backend_memory_; + std::function release_callback_; +}; +#endif + +/// Memory manager class is used primarily for managing the lifetime of GPU +/// tensors in BLS. It mainly consists of a background thread that monitors a +/// message queue in shared memory. Whenever a GPU tensor is created, it will +/// be pushed to the memory manager. The stub process must send a message to the +/// message queue asking the memory manager to deallocate the GPU tensor. +class MemoryManager { + public: + MemoryManager(std::unique_ptr>&& memory_message_queue); + intptr_t AddRecord(std::unique_ptr&& memory_record); + TRITONSERVER_Error* ResetCounter(); + ~MemoryManager(); + + private: + std::thread thread_; + std::unordered_map> records_; + std::unique_ptr> message_queue_; + void QueueMonitorThread(); + std::mutex mu_; +}; +}}}; // namespace triton::backend::python diff --git a/src/message_queue.h b/src/message_queue.h new file mode 100644 index 00000000..06661c66 --- /dev/null +++ b/src/message_queue.h @@ -0,0 +1,326 @@ +// Copyright 2021-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions +// are met: +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// * Neither the name of NVIDIA CORPORATION nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. 
+// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +#pragma once + +#include +#include +#include +#include +#include + +#include "pb_utils.h" +#include "shm_manager.h" +#ifdef TRITON_PB_STUB +#include "pb_stub_log.h" +#endif + +namespace triton { namespace backend { namespace python { +namespace bi = boost::interprocess; + +/// Struct holding the representation of a message queue inside the shared +/// memory. +/// \param size Total size of the message queue. Considered invalid after +/// MessageQueue::LoadFromSharedMemory. Check DLIS-8378 for additional details. +/// \param mutex Handle of the mutex variable protecting index. +/// \param index Used element index. +/// \param sem_empty Semaphore object counting the number of empty buffer slots. +/// \param sem_full Semaphore object counting the number of used buffer slots. +struct MessageQueueShm { + bi::interprocess_semaphore sem_empty{0}; + bi::interprocess_semaphore sem_full{0}; + bi::interprocess_mutex mutex; + std::size_t size; + bi::managed_external_buffer::handle_t buffer; + int head; + int tail; +}; + +template +class MessageQueue { + public: + /// Create a new MessageQueue in the shared memory. + static std::unique_ptr> Create( + std::unique_ptr& shm_pool, + uint32_t message_queue_size) + { + AllocatedSharedMemory mq_shm = + shm_pool->Construct(); + mq_shm.data_->size = message_queue_size; + + AllocatedSharedMemory mq_buffer_shm = + shm_pool->Construct(message_queue_size /* count */); + mq_shm.data_->buffer = mq_buffer_shm.handle_; + mq_shm.data_->head = 0; + mq_shm.data_->tail = 0; + + new (&(mq_shm.data_->mutex)) bi::interprocess_mutex{}; + new (&(mq_shm.data_->sem_empty)) + bi::interprocess_semaphore{message_queue_size}; + new (&(mq_shm.data_->sem_full)) bi::interprocess_semaphore{0}; + + return std::unique_ptr>( + new MessageQueue(mq_shm, mq_buffer_shm)); + } + + /// Load an already existing message queue from the shared memory. + static std::unique_ptr> LoadFromSharedMemory( + std::unique_ptr& shm_pool, + bi::managed_external_buffer::handle_t message_queue_handle) + { + AllocatedSharedMemory mq_shm = + shm_pool->Load(message_queue_handle); + AllocatedSharedMemory mq_shm_buffer = + shm_pool->Load(mq_shm.data_->buffer); + + return std::unique_ptr>( + new MessageQueue(mq_shm, mq_shm_buffer)); + } + + /// Push a message inside the message queue. + /// \param message The shared memory handle of the message. + void Push(T message) + { + while (true) { + try { + SemEmptyMutable()->wait(); + break; + } + catch (bi::interprocess_exception& ex) { + } + } + + { + bi::scoped_lock lock{*MutexMutable()}; + int head_idx = Head(); + // Additional check to avoid out of bounds read/write. Check DLIS-8378 for + // additional details. 
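+ // HeadIncrement()/TailIncrement() keep the indices in [0, Size()), so an
+ // out-of-range value here can only come from a corrupted or inconsistent
+ // queue; log the error and drop the message rather than write out of
+ // bounds.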
+ if (head_idx < 0 || static_cast(head_idx) >= Size()) { + std::string error_msg = + "internal error: message queue head index out of bounds. Expects " + "positive integer less than the size of message queue " + + std::to_string(Size()) + " but got " + std::to_string(head_idx); +#ifdef TRITON_PB_STUB + LOG_ERROR << error_msg; +#else + LOG_MESSAGE(TRITONSERVER_LOG_ERROR, error_msg.c_str()); +#endif + return; + } + Buffer()[head_idx] = message; + HeadIncrement(); + } + SemFullMutable()->post(); + } + + void Push(T message, int const& duration, bool& success) + { + boost::system_time timeout = + boost::get_system_time() + boost::posix_time::milliseconds(duration); + + while (true) { + try { + if (!SemEmptyMutable()->timed_wait(timeout)) { + success = false; + return; + } else { + break; + } + } + catch (bi::interprocess_exception& ex) { + } + } + + { + timeout = + boost::get_system_time() + boost::posix_time::milliseconds(duration); + bi::scoped_lock lock{*MutexMutable(), timeout}; + if (!lock) { + SemEmptyMutable()->post(); + success = false; + return; + } + success = true; + + int head_idx = Head(); + // Additional check to avoid out of bounds read/write. Check DLIS-8378 for + // additional details. + if (head_idx < 0 || static_cast(head_idx) >= Size()) { + std::string error_msg = + "internal error: message queue head index out of bounds. Expects " + "positive integer less than the size of message queue " + + std::to_string(Size()) + " but got " + std::to_string(head_idx); +#ifdef TRITON_PB_STUB + LOG_ERROR << error_msg; +#else + LOG_MESSAGE(TRITONSERVER_LOG_ERROR, error_msg.c_str()); +#endif + return; + } + Buffer()[head_idx] = message; + HeadIncrement(); + } + SemFullMutable()->post(); + } + + /// Pop a message from the message queue. This call will block until there + /// is a message inside the message queue to return. + /// \return the handle of the new message. + T Pop() + { + T message; + + while (true) { + try { + SemFullMutable()->wait(); + break; + } + catch (bi::interprocess_exception& ex) { + } + } + + { + bi::scoped_lock lock{*MutexMutable()}; + + message = Buffer()[Tail()]; + TailIncrement(); + } + SemEmptyMutable()->post(); + + return message; + } + + T Pop(int const& duration, bool& success) + { + T message = 0; + boost::system_time timeout = + boost::get_system_time() + boost::posix_time::milliseconds(duration); + + while (true) { + try { + if (!SemFullMutable()->timed_wait(timeout)) { + success = false; + return message; + } else { + break; + } + } + catch (bi::interprocess_exception& ex) { + } + } + + { + timeout = + boost::get_system_time() + boost::posix_time::milliseconds(duration); + bi::scoped_lock lock{*MutexMutable(), timeout}; + if (!lock) { + SemFullMutable()->post(); + success = false; + return message; + } + success = true; + + message = Buffer()[Tail()]; + TailIncrement(); + } + SemEmptyMutable()->post(); + + return message; + } + + /// Resets the semaphores for the message queue. This function is useful for + /// when the stub process may have exited unexpectedly and the semaphores need + /// to be restarted so that the message queue is in a proper state. 
+ void ResetSemaphores() + { + new (SemFullMutable()) bi::interprocess_semaphore(0); + new (SemEmptyMutable()) bi::interprocess_semaphore(Size()); + new (MutexMutable()) bi::interprocess_mutex; + mq_shm_ptr_->tail = 0; + mq_shm_ptr_->head = 0; + } + + /// Get the shared memory handle of MessageQueue + bi::managed_external_buffer::handle_t ShmHandle() { return mq_handle_; } + + /// Release the ownership of this object in shared memory. + void Release() + { + if (mq_shm_.data_ != nullptr) { + mq_shm_.data_.release(); + } + + if (mq_buffer_shm_.data_ != nullptr) { + mq_buffer_shm_.data_.release(); + } + } + + private: + uint32_t Size() { return size_; } + const bi::interprocess_mutex& Mutex() { return mq_shm_ptr_->mutex; } + bi::interprocess_mutex* MutexMutable() { return &(mq_shm_ptr_->mutex); } + int& Head() { return mq_shm_ptr_->head; } + int& Tail() { return mq_shm_ptr_->tail; } + T* Buffer() { return mq_buffer_shm_ptr_; } + const bi::interprocess_semaphore& SemEmpty() + { + return mq_shm_ptr_->sem_empty; + } + bi::interprocess_semaphore* SemEmptyMutable() + { + return &(mq_shm_ptr_->sem_empty); + } + const bi::interprocess_semaphore& SemFull() { return mq_shm_ptr_->sem_full; } + bi::interprocess_semaphore* SemFullMutable() + { + return &(mq_shm_ptr_->sem_full); + } + + void HeadIncrement() { mq_shm_ptr_->head = (mq_shm_ptr_->head + 1) % Size(); } + void TailIncrement() { mq_shm_ptr_->tail = (mq_shm_ptr_->tail + 1) % Size(); } + + AllocatedSharedMemory mq_shm_; + AllocatedSharedMemory mq_buffer_shm_; + + MessageQueueShm* mq_shm_ptr_; + T* mq_buffer_shm_ptr_; + bi::managed_external_buffer::handle_t mq_handle_; + uint32_t size_; + + /// Create/load a Message queue. + /// \param mq_shm Message queue representation in shared memory. + MessageQueue( + AllocatedSharedMemory& mq_shm, + AllocatedSharedMemory& mq_buffer_shm) + : mq_shm_(std::move(mq_shm)), mq_buffer_shm_(std::move(mq_buffer_shm)) + { + mq_buffer_shm_ptr_ = mq_buffer_shm_.data_.get(); + mq_shm_ptr_ = mq_shm_.data_.get(); + mq_handle_ = mq_shm_.handle_; + size_ = mq_shm_ptr_->size; + } +}; +}}} // namespace triton::backend::python diff --git a/src/metric.cc b/src/metric.cc new file mode 100644 index 00000000..4c055910 --- /dev/null +++ b/src/metric.cc @@ -0,0 +1,394 @@ +// Copyright 2023-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions +// are met: +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// * Neither the name of NVIDIA CORPORATION nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +#include "metric.h" + +#ifdef TRITON_PB_STUB +#include "pb_stub.h" +#endif + +namespace triton { namespace backend { namespace python { + +Metric::Metric( + const std::string& labels, std::optional> buckets, + void* metric_family_address) + : labels_(labels), buckets_(buckets), operation_value_(0), + metric_address_(nullptr), metric_family_address_(metric_family_address), + is_cleared_(false) +{ +#ifdef TRITON_PB_STUB + SendCreateMetricRequest(); +#endif +} + +Metric::~Metric() +{ +#ifdef TRITON_PB_STUB + Clear(); +#endif +} + +void +Metric::SaveToSharedMemory(std::unique_ptr& shm_pool) +{ + AllocatedSharedMemory custom_metric_shm = + shm_pool->Construct(); + custom_metric_shm_ptr_ = custom_metric_shm.data_.get(); + + std::unique_ptr labels_shm = PbString::Create(shm_pool, labels_); + + custom_metric_shm_ptr_->operation_value = operation_value_; + custom_metric_shm_ptr_->labels_shm_handle = labels_shm->ShmHandle(); + custom_metric_shm_ptr_->metric_family_address = metric_family_address_; + custom_metric_shm_ptr_->metric_address = metric_address_; + + // Histogram specific case + if (buckets_.has_value()) { + auto buckets_size = buckets_.value().size() * sizeof(double); + std::unique_ptr buckets_shm = PbMemory::Create( + shm_pool, TRITONSERVER_MemoryType::TRITONSERVER_MEMORY_CPU, 0, + buckets_size, reinterpret_cast(buckets_.value().data()), + false /* copy_gpu */); + custom_metric_shm_ptr_->buckets_shm_handle = buckets_shm->ShmHandle(); + buckets_shm_ = std::move(buckets_shm); + } else { + custom_metric_shm_ptr_->buckets_shm_handle = 0; + buckets_shm_ = nullptr; + } + + // Save the references to shared memory. + custom_metric_shm_ = std::move(custom_metric_shm); + labels_shm_ = std::move(labels_shm); + shm_handle_ = custom_metric_shm_.handle_; +} + +std::unique_ptr +Metric::LoadFromSharedMemory( + std::unique_ptr& shm_pool, + bi::managed_external_buffer::handle_t handle) +{ + AllocatedSharedMemory custom_metric_shm = + shm_pool->Load(handle); + MetricShm* custom_metric_shm_ptr = custom_metric_shm.data_.get(); + + std::unique_ptr labels_shm = PbString::LoadFromSharedMemory( + shm_pool, custom_metric_shm_ptr->labels_shm_handle); + + std::unique_ptr buckets_shm = nullptr; + if (custom_metric_shm_ptr->buckets_shm_handle != 0) { + buckets_shm = PbMemory::LoadFromSharedMemory( + shm_pool, custom_metric_shm_ptr->buckets_shm_handle, + false /* open_cuda_handle */); + } + + return std::unique_ptr( + new Metric(custom_metric_shm, labels_shm, buckets_shm)); +} + +Metric::Metric( + AllocatedSharedMemory& custom_metric_shm, + std::unique_ptr& labels_shm, + std::unique_ptr& buckets_shm) + : custom_metric_shm_(std::move(custom_metric_shm)), + labels_shm_(std::move(labels_shm)), buckets_shm_(std::move(buckets_shm)) +{ + custom_metric_shm_ptr_ = custom_metric_shm_.data_.get(); + + // FIXME: This constructor is called during each + // set/increment/observe/get_value call. It only needs the pointers. 
+ labels_ = labels_shm_->String(); + if (buckets_shm_ != nullptr) { // Histogram + size_t bucket_size = buckets_shm_->ByteSize() / sizeof(double); + std::vector buckets; + buckets.reserve(bucket_size); + for (size_t i = 0; i < bucket_size; ++i) { + buckets.emplace_back( + reinterpret_cast(buckets_shm_->DataPtr())[i]); + } + buckets_ = std::move(buckets); + } + + operation_value_ = custom_metric_shm_ptr_->operation_value; + metric_family_address_ = custom_metric_shm_ptr_->metric_family_address; + metric_address_ = custom_metric_shm_ptr_->metric_address; +} + +void* +Metric::MetricAddress() +{ + return metric_address_; +} + +#ifdef TRITON_PB_STUB +void +Metric::SendCreateMetricRequest() +{ + // Send the request to create the Metric to the parent process + std::unique_ptr& stub = Stub::GetOrCreateInstance(); + SaveToSharedMemory(stub->ShmPool()); + CustomMetricsMessage* custom_metrics_msg = nullptr; + AllocatedSharedMemory custom_metrics_shm; + try { + stub->SendMessage( + custom_metrics_shm, PYTHONSTUB_MetricRequestNew, shm_handle_); + } + catch (const PythonBackendException& pb_exception) { + throw PythonBackendException( + "Error when creating Metric: " + std::string(pb_exception.what())); + } + + custom_metrics_msg = custom_metrics_shm.data_.get(); + metric_address_ = custom_metrics_msg->address; +} + +void +Metric::SendIncrementRequest(const double& value) +{ + py::gil_scoped_release release; + try { + CheckIfCleared(); + std::unique_ptr& stub = Stub::GetOrCreateInstance(); + operation_value_ = value; + SaveToSharedMemory(stub->ShmPool()); + AllocatedSharedMemory custom_metrics_shm; + stub->SendMessage( + custom_metrics_shm, PYTHONSTUB_MetricRequestIncrement, shm_handle_); + } + catch (const PythonBackendException& pb_exception) { + throw PythonBackendException( + "Failed to increment metric value: " + + std::string(pb_exception.what())); + } +} + +void +Metric::SendSetValueRequest(const double& value) +{ + try { + CheckIfCleared(); + std::unique_ptr& stub = Stub::GetOrCreateInstance(); + operation_value_ = value; + SaveToSharedMemory(stub->ShmPool()); + AllocatedSharedMemory custom_metrics_shm; + stub->SendMessage( + custom_metrics_shm, PYTHONSTUB_MetricRequestSet, shm_handle_); + } + catch (const PythonBackendException& pb_exception) { + throw PythonBackendException( + "Failed to set metric value: " + std::string(pb_exception.what())); + } +} + +void +Metric::SendObserveRequest(const double& value) +{ + py::gil_scoped_release release; + try { + CheckIfCleared(); + std::unique_ptr& stub = Stub::GetOrCreateInstance(); + operation_value_ = value; + SaveToSharedMemory(stub->ShmPool()); + AllocatedSharedMemory custom_metrics_shm; + stub->SendMessage( + custom_metrics_shm, PYTHONSTUB_MetricRequestObserve, shm_handle_); + } + catch (const PythonBackendException& pb_exception) { + throw PythonBackendException( + "Failed to observe metric value: " + std::string(pb_exception.what())); + } +} + +double +Metric::SendGetValueRequest() +{ + CustomMetricsMessage* custom_metrics_msg = nullptr; + AllocatedSharedMemory custom_metrics_shm; + try { + CheckIfCleared(); + std::unique_ptr& stub = Stub::GetOrCreateInstance(); + SaveToSharedMemory(stub->ShmPool()); + stub->SendMessage( + custom_metrics_shm, PYTHONSTUB_MetricRequestValue, shm_handle_); + } + catch (const PythonBackendException& pb_exception) { + throw PythonBackendException( + "Failed to get metric value: " + std::string(pb_exception.what())); + } + + custom_metrics_msg = custom_metrics_shm.data_.get(); + return custom_metrics_msg->value; +} + 
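The stub-side `Send*Request` helpers above all share one round trip: serialize the `Metric` into shared memory, send a `PYTHONSTUB_MetricRequest*` command to the parent process, and, for value queries, read the answer back out of the returned message. The following is a minimal, hedged sketch of driving a counter through that path; it assumes a `TRITON_PB_STUB` build with an initialized `Stub` singleton and the GIL held by the caller, and the family/label names are illustrative only, not part of this patch.

```cpp
// Hedged sketch (not part of this patch): exercises the stub-side Metric API
// declared in metric.h / metric_family.h. Assumes TRITON_PB_STUB is defined,
// Stub::GetOrCreateInstance() is already usable, and the caller holds the GIL.
#include "metric.h"
#include "metric_family.h"

using namespace triton::backend::python;

void ExampleCounterRoundTrip()
{
  // Register a counter family with the parent process. kCounter is the
  // MetricKind enumerator referenced elsewhere in this patch.
  std::shared_ptr<MetricFamily> family = MetricFamily::CreateMetricFamily(
      "example_requests_total", "illustrative request counter", kCounter);

  // Labels travel as a JSON string; counters carry no histogram buckets.
  // The stub-side constructor sends the create request to the parent.
  Metric counter(
      R"({"model":"example"})", std::nullopt, family->MetricFamilyAddress());

  counter.SendIncrementRequest(1.0);             // parent increments the metric
  double value = counter.SendGetValueRequest();  // parent reads it back
  (void)value;
}
```

Note that `counter` is declared after `family`, so it is destroyed first, which matches the ordering requirement documented in `CheckIfCleared()`.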
+void +Metric::Clear() +{ + // Need to check if the metric has been cleared before as the Clear()' + // function can be called from two different locations: when the metric family + // clears the 'metric_map_' and when the 'Metric' object goes out of + // scope/being deleted. + if (!is_cleared_) { + is_cleared_ = true; + std::unique_ptr& stub = Stub::GetOrCreateInstance(); + SaveToSharedMemory(stub->ShmPool()); + AllocatedSharedMemory custom_metrics_shm; + try { + stub->SendMessage( + custom_metrics_shm, PYTHONSTUB_MetricRequestDelete, shm_handle_); + } + catch (const PythonBackendException& pb_exception) { + std::cerr << "Error when deleting Metric: " << pb_exception.what() + << "\n"; + } + } +} + +void +Metric::CheckIfCleared() +{ + if (is_cleared_) { + throw PythonBackendException( + "Invalid metric operation as the corresponding 'MetricFamily' has been " + "deleted. The 'MetricFamily' object should be deleted AFTER its " + "corresponding 'Metric' objects have been deleted."); + } +} + +#else +void* +Metric::InitializeTritonMetric() +{ + std::vector labels_params; + ParseLabels(labels_params, labels_); + TRITONSERVER_MetricKind kind; + THROW_IF_TRITON_ERROR(TRITONSERVER_GetMetricFamilyKind( + reinterpret_cast(metric_family_address_), + &kind)); + TRITONSERVER_MetricArgs* args = nullptr; + switch (kind) { + case TRITONSERVER_METRIC_KIND_COUNTER: + case TRITONSERVER_METRIC_KIND_GAUGE: + break; + case TRITONSERVER_METRIC_KIND_HISTOGRAM: { + const std::vector& buckets = buckets_.value(); + THROW_IF_TRITON_ERROR(TRITONSERVER_MetricArgsNew(&args)); + THROW_IF_TRITON_ERROR(TRITONSERVER_MetricArgsSetHistogram( + args, buckets.data(), buckets.size())); + break; + } + default: + break; + } + + TRITONSERVER_Metric* triton_metric = nullptr; + THROW_IF_TRITON_ERROR(TRITONSERVER_MetricNewWithArgs( + &triton_metric, + reinterpret_cast(metric_family_address_), + labels_params.data(), labels_params.size(), args)); + for (const auto label : labels_params) { + TRITONSERVER_ParameterDelete(const_cast(label)); + } + THROW_IF_TRITON_ERROR(TRITONSERVER_MetricArgsDelete(args)); + return reinterpret_cast(triton_metric); +} + +void +Metric::ParseLabels( + std::vector& labels_params, + const std::string& labels) +{ + triton::common::TritonJson::Value labels_json; + THROW_IF_TRITON_ERROR(labels_json.Parse(labels)); + + std::vector members; + labels_json.Members(&members); + for (const auto& member : members) { + std::string value; + THROW_IF_TRITON_ERROR(labels_json.MemberAsString(member.c_str(), &value)); + labels_params.emplace_back(TRITONSERVER_ParameterNew( + member.c_str(), TRITONSERVER_PARAMETER_STRING, value.c_str())); + } +} + +void +Metric::HandleMetricOperation( + CustomMetricsMessage* metrics_message_ptr, + const PYTHONSTUB_CommandType& command_type) +{ + if (command_type == PYTHONSTUB_MetricRequestValue) { + metrics_message_ptr->value = GetValue(); + } else if (command_type == PYTHONSTUB_MetricRequestIncrement) { + Increment(operation_value_); + } else if (command_type == PYTHONSTUB_MetricRequestSet) { + SetValue(operation_value_); + } else if (command_type == PYTHONSTUB_MetricRequestObserve) { + Observe(operation_value_); + } else { + throw PythonBackendException("Unknown metric operation"); + } +} + +void +Metric::Increment(const double& value) +{ + auto triton_metric = reinterpret_cast(metric_address_); + THROW_IF_TRITON_ERROR(TRITONSERVER_MetricIncrement(triton_metric, value)); +} + +void +Metric::SetValue(const double& value) +{ + auto triton_metric = reinterpret_cast(metric_address_); + 
THROW_IF_TRITON_ERROR(TRITONSERVER_MetricSet(triton_metric, value)); +} + +void +Metric::Observe(const double& value) +{ + auto triton_metric = reinterpret_cast(metric_address_); + THROW_IF_TRITON_ERROR(TRITONSERVER_MetricObserve(triton_metric, value)); +} + +double +Metric::GetValue() +{ + double value; + auto triton_metric = reinterpret_cast(metric_address_); + THROW_IF_TRITON_ERROR(TRITONSERVER_MetricValue(triton_metric, &value)); + return value; +} + +void +Metric::ClearTritonMetric() +{ + auto triton_metric = reinterpret_cast(metric_address_); + if (triton_metric != nullptr) { + LOG_IF_ERROR(TRITONSERVER_MetricDelete(triton_metric), "deleting metric"); + } +} + +#endif + +}}} // namespace triton::backend::python diff --git a/src/metric.h b/src/metric.h new file mode 100644 index 00000000..cd54ca54 --- /dev/null +++ b/src/metric.h @@ -0,0 +1,193 @@ +// Copyright 2023-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions +// are met: +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// * Neither the name of NVIDIA CORPORATION nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +#pragma once + +#include +#include + +#include "ipc_message.h" +#include "pb_memory.h" +#include "pb_string.h" +#include "pb_utils.h" + +#ifdef TRITON_PB_STUB +#include +namespace py = pybind11; +#else +#include "triton/core/tritonserver.h" +#endif + +namespace triton { namespace backend { namespace python { + +// The 'MetricShm' struct is utilized by the 'Metric' class for saving the +// essential data to shared memory and for loading the data from shared memory +// in order to reconstruct the 'Metric' object. +struct MetricShm { + // The shared memory handle of the labels in PbString format. + bi::managed_external_buffer::handle_t labels_shm_handle; + // The shared memory handle of the buckets in PbMemory format. + bi::managed_external_buffer::handle_t buckets_shm_handle; + // The value used for incrementing or setting the metric. + double operation_value; + // The address of the TRITONSERVER_Metric object. + void* metric_address; + // The address corresponds to the TRITONSERVER_MetricFamily object that this + // metric belongs to. 
+ void* metric_family_address; +}; + +class Metric { + public: + Metric( + const std::string& labels, + std::optional> buckets, + void* metric_family_address); + + ~Metric(); + + /// Save Custom Metric object to shared memory. + /// \param shm_pool Shared memory pool to save the custom metric object. + void SaveToSharedMemory(std::unique_ptr& shm_pool); + + /// Create a Custom Metric object from shared memory. + /// \param shm_pool Shared memory pool + /// \param handle Shared memory handle of the custom metric. + /// \return Returns the custom metrics in the specified request_handle + /// location. + static std::unique_ptr LoadFromSharedMemory( + std::unique_ptr& shm_pool, + bi::managed_external_buffer::handle_t handle); + + /// Get the address of the TRITONSERVER_Metric object. + /// \return Returns the address of the TRITONSERVER_Metric object. + void* MetricAddress(); + + /// Send the request to the parent process to delete the Metric object. + void Clear(); + +#ifdef TRITON_PB_STUB + /// Send a request to register a new 'TRITONSERVER_Metric' object to the + /// parent process. + void SendCreateMetricRequest(); + + /// Send the request to the parent process to increment the metric by the + /// specified value. + /// \param value The value to increment the metric by. + void SendIncrementRequest(const double& value); + + /// Send the request to the parent process to set the metric to the specified + /// value. + /// \param value The value to set the metric to. + void SendSetValueRequest(const double& value); + + /// Send the request to the parent process to observe the value to the metric. + /// \param value The value to set the metric to. + void SendObserveRequest(const double& value); + + /// Send the request to the parent process to get the value of the metric. + /// \return Returns the value of the metric. + double SendGetValueRequest(); + + /// Throws an exception if the metric has been cleared. This check is to avoid + /// the user error where the corresponding metric family has been deleted + /// before the metric is deleted. + void CheckIfCleared(); +#else + // Initialize the TRITONSERVER_Metric object. + /// \return Returns the address of the TRITONSERVER_Metric object. + void* InitializeTritonMetric(); + + /// Parse the labels string into a vector of TRITONSERVER_Parameter. + /// \param labels_params The vector of TRITONSERVER_Parameter to store the + /// parsed labels. + /// \param labels The labels string to parse. + void ParseLabels( + std::vector& labels_params, + const std::string& labels); + + /// Handle the metric operation. + /// \param metrics_message_ptr The pointer to the CustomMetricsMessage object. + void HandleMetricOperation( + CustomMetricsMessage* metrics_message_ptr, + const PYTHONSTUB_CommandType& command_type); + + /// Use Triton C API to increment the value of the metric by the given value. + /// \param value The value to increment the metric by. + void Increment(const double& value); + + /// Use Triton C API to set the value of the metric to the given value. + /// \param value The value to set the metric to. + void SetValue(const double& value); + + /// Use Triton C API to sample the observation to the metric. + /// \param value The value to sample observation to the metric. + void Observe(const double& value); + + /// Use Triton C API to get the value of the metric. + double GetValue(); + + /// Clear the TRITONSERVER_Metric object. + void ClearTritonMetric(); +#endif + + /// Disallow copying the custom metric object. 
+ DISALLOW_COPY_AND_ASSIGN(Metric); + + private: + // The private constructor for creating a Metric object from shared memory. + Metric( + AllocatedSharedMemory& custom_metric_shm, + std::unique_ptr& labels_shm, + std::unique_ptr& buckets); + + // The labels of the metric, which is the identifier of the metric. + std::string labels_; + // Monotonically increasing values representing bucket boundaries for creating + // histogram metric. + std::optional> buckets_; + // The value used for incrementing or setting the metric. + double operation_value_; + // The address of the TRITONSERVER_Metric object. + void* metric_address_; + // The address corresponds to the TRITONSERVER_MetricFamily object that this + // metric belongs to. + void* metric_family_address_; + // Indicates whether the metric has been cleared. It is needed as the Clear()' + // function can be called from two different locations: when the metric family + // clears the 'metric_map_' and when the 'Metric' object goes out of + // scope/being deleted. + bool is_cleared_; + + // Shared Memory Data Structures + AllocatedSharedMemory custom_metric_shm_; + MetricShm* custom_metric_shm_ptr_; + bi::managed_external_buffer::handle_t shm_handle_; + std::unique_ptr labels_shm_; + std::unique_ptr buckets_shm_; +}; + +}}}; // namespace triton::backend::python diff --git a/src/metric_family.cc b/src/metric_family.cc new file mode 100644 index 00000000..222a0e23 --- /dev/null +++ b/src/metric_family.cc @@ -0,0 +1,248 @@ +// Copyright 2023-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions +// are met: +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// * Neither the name of NVIDIA CORPORATION nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
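On the other side of that IPC boundary, the non-stub half of metric.cc above rebuilds the `Metric` from the handle the stub sent and applies the requested operation through the Triton C API. Below is a hedged sketch of how the parent process might service an increment command; `shm_pool`, `metric_handle`, and `metrics_msg` stand in for values the real backend extracts from the IPC message and are not defined in this patch.

```cpp
// Hedged sketch (not part of this patch): parent-process dispatch for a
// PYTHONSTUB_MetricRequestIncrement command. Compiled only when
// TRITON_PB_STUB is NOT defined, where HandleMetricOperation() is available.
#include "metric.h"

namespace pb = triton::backend::python;
namespace bi = boost::interprocess;

void ServiceMetricIncrement(
    std::unique_ptr<pb::SharedMemoryManager>& shm_pool,
    bi::managed_external_buffer::handle_t metric_handle,
    pb::CustomMetricsMessage* metrics_msg)
{
  // Rebuild the Metric object the stub serialized into shared memory.
  std::unique_ptr<pb::Metric> metric =
      pb::Metric::LoadFromSharedMemory(shm_pool, metric_handle);

  // Dispatches to TRITONSERVER_MetricIncrement() using the operation value
  // stored in shared memory; other command types map to Set/Observe/Value.
  metric->HandleMetricOperation(
      metrics_msg, pb::PYTHONSTUB_MetricRequestIncrement);
}
```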
+ +#include "metric_family.h" + +#ifdef TRITON_PB_STUB +#include "pb_stub.h" +#endif + +namespace triton { namespace backend { namespace python { + +MetricFamily::MetricFamily( + const std::string& name, const std::string& description, + const MetricKind& kind) + : name_(name), description_(description), kind_(kind), + metric_family_address_(nullptr) +{ +#ifdef TRITON_PB_STUB + SendCreateMetricFamilyRequest(); +#endif +} + +MetricFamily::~MetricFamily() +{ +#ifdef TRITON_PB_STUB + // Clear all the metrics first + { + std::lock_guard lock(metric_map_mu_); + for (auto& m : metric_map_) { + m.second->Clear(); + } + } + + // Send the request to delete the MetricFamily to the parent process + std::unique_ptr& stub = Stub::GetOrCreateInstance(); + SaveToSharedMemory(stub->ShmPool()); + AllocatedSharedMemory custom_metrics_shm; + try { + stub->SendMessage( + custom_metrics_shm, PYTHONSTUB_MetricFamilyRequestDelete, shm_handle_); + } + catch (const PythonBackendException& pb_exception) { + std::cerr << "Error when deleting MetricFamily: " << pb_exception.what() + << "\n"; + } +#endif +}; + +void +MetricFamily::SaveToSharedMemory(std::unique_ptr& shm_pool) +{ + AllocatedSharedMemory custom_metric_family_shm = + shm_pool->Construct(); + + custom_metric_family_shm_ptr_ = custom_metric_family_shm.data_.get(); + std::unique_ptr name_shm = PbString::Create(shm_pool, name_); + std::unique_ptr description_shm = + PbString::Create(shm_pool, description_); + + custom_metric_family_shm_ptr_->kind = kind_; + custom_metric_family_shm_ptr_->name_shm_handle = name_shm->ShmHandle(); + custom_metric_family_shm_ptr_->description_shm_handle = + description_shm->ShmHandle(); + custom_metric_family_shm_ptr_->metric_family_address = metric_family_address_; + + // Save the references to shared memory. 
+ custom_metric_family_shm_ = std::move(custom_metric_family_shm); + name_shm_ = std::move(name_shm); + description_shm_ = std::move(description_shm); + shm_handle_ = custom_metric_family_shm_.handle_; +} + +std::unique_ptr +MetricFamily::LoadFromSharedMemory( + std::unique_ptr& shm_pool, + bi::managed_external_buffer::handle_t handle) +{ + AllocatedSharedMemory custom_metric_family_shm = + shm_pool->Load(handle); + MetricFamilyShm* custom_metric_family_shm_ptr = + custom_metric_family_shm.data_.get(); + std::unique_ptr name_shm = PbString::LoadFromSharedMemory( + shm_pool, custom_metric_family_shm_ptr->name_shm_handle); + std::unique_ptr description_shm = PbString::LoadFromSharedMemory( + shm_pool, custom_metric_family_shm_ptr->description_shm_handle); + + return std::unique_ptr( + new MetricFamily(custom_metric_family_shm, name_shm, description_shm)); +} + +MetricFamily::MetricFamily( + AllocatedSharedMemory& custom_metric_family_shm, + std::unique_ptr& name_shm, + std::unique_ptr& description_shm) + : custom_metric_family_shm_(std::move(custom_metric_family_shm)), + name_shm_(std::move(name_shm)), + description_shm_(std::move(description_shm)) +{ + custom_metric_family_shm_ptr_ = custom_metric_family_shm_.data_.get(); + name_ = name_shm_->String(); + description_ = description_shm_->String(); + kind_ = custom_metric_family_shm_ptr_->kind; + metric_family_address_ = custom_metric_family_shm_ptr_->metric_family_address; +} + +void* +MetricFamily::MetricFamilyAddress() +{ + return metric_family_address_; +} + +#ifdef TRITON_PB_STUB +std::shared_ptr +MetricFamily::CreateMetricFamily( + const std::string& name, const std::string& description, + const MetricKind& kind) +{ + std::shared_ptr metric_family = + std::make_shared(name, description, kind); + metric_family->SendCreateMetricFamilyRequest(); + return metric_family; +} + +void +MetricFamily::SendCreateMetricFamilyRequest() +{ + std::unique_ptr& stub = Stub::GetOrCreateInstance(); + SaveToSharedMemory(stub->ShmPool()); + CustomMetricsMessage* custom_metrics_msg = nullptr; + AllocatedSharedMemory custom_metrics_shm; + try { + stub->SendMessage( + custom_metrics_shm, PYTHONSTUB_MetricFamilyRequestNew, shm_handle_); + } + catch (const PythonBackendException& pb_exception) { + throw PythonBackendException( + "Error when creating MetricFamily: " + + std::string(pb_exception.what())); + } + + custom_metrics_msg = custom_metrics_shm.data_.get(); + metric_family_address_ = custom_metrics_msg->address; +} + +std::shared_ptr +MetricFamily::CreateMetric(const py::object& labels, const py::object& buckets) +{ + if (!labels.is_none()) { + if (!py::isinstance(labels)) { + throw PythonBackendException( + "Failed to create metric. Labels must be a dictionary."); + } + } + + py::module json = py::module_::import("json"); + std::string labels_str = std::string(py::str(json.attr("dumps")(labels))); + + std::optional> buckets_vec; + if (!buckets.is_none()) { + if (!py::isinstance(buckets)) { + throw PythonBackendException( + "Failed to create metric. Buckets must be a list."); + } + if (kind_ == kCounter || kind_ == kGauge) { + throw PythonBackendException( + "Failed to create metric. Unexpected buckets found."); + } + buckets_vec = buckets.cast>(); + } else { + if (kind_ == kHistogram) { + throw PythonBackendException( + "Failed to create metric. 
Missing required buckets."); + } + buckets_vec = std::nullopt; + } + + auto metric = + std::make_shared(labels_str, buckets_vec, metric_family_address_); + { + std::lock_guard lock(metric_map_mu_); + metric_map_.insert({metric->MetricAddress(), metric}); + } + + return metric; +} +#else +void* +MetricFamily::InitializeTritonMetricFamily() +{ + TRITONSERVER_MetricKind triton_kind = ToTritonServerMetricKind(kind_); + TRITONSERVER_MetricFamily* triton_metric_family = nullptr; + THROW_IF_TRITON_ERROR(TRITONSERVER_MetricFamilyNew( + &triton_metric_family, triton_kind, name_.c_str(), description_.c_str())); + return reinterpret_cast(triton_metric_family); +} + +TRITONSERVER_MetricKind +MetricFamily::ToTritonServerMetricKind(const MetricKind& kind) +{ + switch (kind) { + case kCounter: + return TRITONSERVER_METRIC_KIND_COUNTER; + case kGauge: + return TRITONSERVER_METRIC_KIND_GAUGE; + case kHistogram: + return TRITONSERVER_METRIC_KIND_HISTOGRAM; + default: + throw PythonBackendException("Unknown metric kind"); + } +} + +void +MetricFamily::ClearTritonMetricFamily() +{ + auto metric_family = + reinterpret_cast(metric_family_address_); + if (metric_family != nullptr) { + LOG_IF_ERROR( + TRITONSERVER_MetricFamilyDelete(metric_family), + "deleting metric family"); + } +} +#endif + +}}} // namespace triton::backend::python diff --git a/src/metric_family.h b/src/metric_family.h new file mode 100644 index 00000000..2b5f86ab --- /dev/null +++ b/src/metric_family.h @@ -0,0 +1,154 @@ +// Copyright 2023-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions +// are met: +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// * Neither the name of NVIDIA CORPORATION nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
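The parent-process half of `MetricFamily` above is what ultimately backs `MetricFamilyAddress()`: on a `PYTHONSTUB_MetricFamilyRequestNew` the backend reloads the family from shared memory, creates the `TRITONSERVER_MetricFamily`, and returns its address to the stub. A hedged sketch of that flow follows, with `shm_pool` and `family_handle` again standing in for values carried by the real IPC message.

```cpp
// Hedged sketch (not part of this patch): parent-process handling of the
// MetricFamily new/delete commands. Non-stub build only.
#include "metric_family.h"

namespace pb = triton::backend::python;
namespace bi = boost::interprocess;

void* ServiceMetricFamilyNew(
    std::unique_ptr<pb::SharedMemoryManager>& shm_pool,
    bi::managed_external_buffer::handle_t family_handle)
{
  std::unique_ptr<pb::MetricFamily> family =
      pb::MetricFamily::LoadFromSharedMemory(shm_pool, family_handle);

  // Creates the TRITONSERVER_MetricFamily and returns its address; the stub
  // stores this pointer and passes it back when creating individual metrics.
  void* family_address = family->InitializeTritonMetricFamily();

  // On PYTHONSTUB_MetricFamilyRequestDelete the backend would later call
  // ClearTritonMetricFamily() on a MetricFamily reloaded the same way.
  return family_address;
}
```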
+ +#pragma once + +#include + +#include "ipc_message.h" +#include "metric.h" +#include "pb_string.h" +#include "pb_utils.h" + +#ifdef TRITON_PB_STUB +#include +namespace py = pybind11; +#else +#include "triton/core/tritonserver.h" +#endif + +namespace triton { namespace backend { namespace python { + +// The 'MetricFamilyShm' struct is utilized by the 'MetricFamily' class for +// saving the essential data to shared memory and for loading the data from +// shared memory in order to reconstruct the 'MetricFamily' object. +struct MetricFamilyShm { + // The shared memory handle of the name in PbString format. + bi::managed_external_buffer::handle_t name_shm_handle; + // The shared memory handle of the description in PbString format. + bi::managed_external_buffer::handle_t description_shm_handle; + // The metric kind of the 'MetricFamily'. + MetricKind kind; + // The address of the 'TRITONSERVER_MetricFamily' object. + void* metric_family_address; +}; + +class MetricFamily { + public: + MetricFamily( + const std::string& name, const std::string& description, + const MetricKind& kind); + + ~MetricFamily(); + + /// Save a custom metric family to shared memory. + /// \param shm_pool Shared memory pool to save the custom metric family. + void SaveToSharedMemory(std::unique_ptr& shm_pool); + + /// Create a Custom Metric Family object from shared memory. + /// \param shm_pool Shared memory pool + /// \param handle Shared memory handle of the custom metric family. + /// \return Returns the custom metric family in the specified handle + /// location. + static std::unique_ptr LoadFromSharedMemory( + std::unique_ptr& shm_pool, + bi::managed_external_buffer::handle_t handle); + + /// Get the address of the TRITONSERVER_MetricFamily object. + /// \return Returns the address of the TRITONSERVER_MetricFamily object. + void* MetricFamilyAddress(); + +#ifdef TRITON_PB_STUB + /// Create a metric family object and returned as a shared pointer. + /// \param name The name of the metric family. + /// \param description The description of the metric family. + /// \param kind The metric kind of the metric family. + /// \return Returns the shared pointer to the created metric family. + static std::shared_ptr CreateMetricFamily( + const std::string& name, const std::string& description, + const MetricKind& kind); + + /// Send a request to register a new 'TRITONSERVER_MetricFamily' object to the + /// parent process. + void SendCreateMetricFamilyRequest(); + + /// Create a metric from the metric family and store it in the metric map. + /// \param labels The labels of the metric. + /// \param buckets Monotonically increasing values representing bucket + /// boundaries for creating histogram metric. + /// \return Returns the shared pointer to the created metric. + std::shared_ptr CreateMetric( + const py::object& labels, const py::object& buckets); +#else + /// Initialize the TRITONSERVER_MetricFamily object. + /// \return Returns the address of the TRITONSERVER_MetricFamily object. + void* InitializeTritonMetricFamily(); + + /// Helper function to convert the MetricKind enum to TRITONSERVER_MetricKind + /// \param kind The MetricKind enum to be converted. + /// \return Returns the TRITONSERVER_MetricKind enum. + TRITONSERVER_MetricKind ToTritonServerMetricKind(const MetricKind& kind); + + /// Clear the TRITONSERVER_MetricFamily object. + void ClearTritonMetricFamily(); +#endif + + /// Disallow copying the metric family object. 
+ DISALLOW_COPY_AND_ASSIGN(MetricFamily); + + private: + // The private constructor for creating a MetricFamily object from shared + // memory. + MetricFamily( + AllocatedSharedMemory& custom_metric_family_shm, + std::unique_ptr& name_shm, + std::unique_ptr& description_shm); + + // The name of the metric family. + std::string name_; + // The description of the metric family. + std::string description_; + // The metric kind of the metric family. Currently only supports GAUGE, + // COUNTER and HISTOGRAM. + MetricKind kind_; + // The address of the TRITONSERVER_MetricFamily object. + void* metric_family_address_; + + // The mutex to protect the 'metric_map_'. + std::mutex metric_map_mu_; + // Need to keep track of the metrics associated with the metric family to make + // sure the metrics are cleaned up before the metric family is deleted. + std::unordered_map> metric_map_; + + // Shared Memory Data Structures + AllocatedSharedMemory custom_metric_family_shm_; + MetricFamilyShm* custom_metric_family_shm_ptr_; + bi::managed_external_buffer::handle_t shm_handle_; + std::unique_ptr name_shm_; + std::unique_ptr description_shm_; +}; + +}}}; // namespace triton::backend::python diff --git a/src/model_loader.cc b/src/model_loader.cc new file mode 100644 index 00000000..0be45fa5 --- /dev/null +++ b/src/model_loader.cc @@ -0,0 +1,267 @@ +// Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions +// are met: +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// * Neither the name of NVIDIA CORPORATION nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
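`CreateMetric` above enforces the kind/bucket pairing: histogram families must supply monotonically increasing bucket boundaries, while counter and gauge families must not. To complement the counter sketch earlier, here is a hedged stub-side sketch of a histogram metric; names are illustrative, and the Python-facing path would normally go through `MetricFamily::CreateMetric` with pybind objects rather than constructing `Metric` directly.

```cpp
// Hedged sketch (not part of this patch): a histogram Metric created from the
// stub side with explicit bucket boundaries. Assumes TRITON_PB_STUB and an
// initialized Stub, with the GIL held by the caller.
#include "metric.h"
#include "metric_family.h"

using namespace triton::backend::python;

void ExampleHistogramObserve()
{
  std::shared_ptr<MetricFamily> family = MetricFamily::CreateMetricFamily(
      "example_latency_seconds", "illustrative latency histogram", kHistogram);

  // Buckets must be monotonically increasing; they are serialized into shared
  // memory as a PbMemory block alongside the JSON label string.
  std::vector<double> buckets{0.005, 0.05, 0.5, 5.0};
  Metric latency(
      R"({"model":"example"})", buckets, family->MetricFamilyAddress());

  latency.SendObserveRequest(0.042);  // parent calls TRITONSERVER_MetricObserve
}
```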
+#include "model_loader.h" + +#ifdef TRITON_PB_STUB +#include "pb_stub.h" +#endif + +namespace triton { namespace backend { namespace python { + +void +ModelLoader::SaveToSharedMemory(std::unique_ptr& shm_pool) +{ + AllocatedSharedMemory model_loader_req_shm = + shm_pool->Construct(); + model_loader_req_shm_ptr_ = model_loader_req_shm.data_.get(); + + std::unique_ptr name_shm = PbString::Create(shm_pool, name_); + std::unique_ptr version_shm = PbString::Create(shm_pool, version_); + std::unique_ptr config_shm = PbString::Create(shm_pool, config_); + std::unique_ptr files_shm = PbMap::Create(shm_pool, files_); + + model_loader_req_shm_ptr_->name_shm_handle = name_shm->ShmHandle(); + model_loader_req_shm_ptr_->version_shm_handle = version_shm->ShmHandle(); + model_loader_req_shm_ptr_->config_shm_handle = config_shm->ShmHandle(); + model_loader_req_shm_ptr_->files_shm_handle = files_shm->ShmHandle(); + model_loader_req_shm_ptr_->unload_dependents = unload_dependents_; + + // Save the references to shared memory. + model_loader_req_shm_ = std::move(model_loader_req_shm); + name_shm_ = std::move(name_shm); + version_shm_ = std::move(version_shm); + config_shm_ = std::move(config_shm); + files_shm_ = std::move(files_shm); + + shm_handle_ = model_loader_req_shm_.handle_; +} + +std::unique_ptr +ModelLoader::LoadFromSharedMemory( + std::unique_ptr& shm_pool, + bi::managed_external_buffer::handle_t handle) +{ + AllocatedSharedMemory model_loader_req_shm = + shm_pool->Load(handle); + ModelLoaderRequestShm* model_loader_req_shm_ptr = + model_loader_req_shm.data_.get(); + + std::unique_ptr name_shm = PbString::LoadFromSharedMemory( + shm_pool, model_loader_req_shm_ptr->name_shm_handle); + std::unique_ptr version_shm = PbString::LoadFromSharedMemory( + shm_pool, model_loader_req_shm_ptr->version_shm_handle); + std::unique_ptr config_shm = PbString::LoadFromSharedMemory( + shm_pool, model_loader_req_shm_ptr->config_shm_handle); + std::unique_ptr files_shm = PbMap::LoadFromSharedMemory( + shm_pool, model_loader_req_shm_ptr->files_shm_handle); + + return std::unique_ptr(new ModelLoader( + model_loader_req_shm, name_shm, version_shm, config_shm, files_shm)); +} + +ModelLoader::ModelLoader( + AllocatedSharedMemory& model_loader_req_shm, + std::unique_ptr& name_shm, std::unique_ptr& version_shm, + std::unique_ptr& config_shm, std::unique_ptr& files_shm) + : model_loader_req_shm_(std::move(model_loader_req_shm)), + name_shm_(std::move(name_shm)), version_shm_(std::move(version_shm)), + config_shm_(std::move(config_shm)), files_shm_(std::move(files_shm)) +{ + model_loader_req_shm_ptr_ = model_loader_req_shm_.data_.get(); + name_ = name_shm_->String(); + version_ = version_shm_->String(); + config_ = config_shm_->String(); + files_ = files_shm_->UnorderedMap(); + unload_dependents_ = model_loader_req_shm_ptr_->unload_dependents; +} +#ifdef TRITON_PB_STUB +void +ModelLoader::SendLoadModelRequest() +{ + std::unique_ptr& stub = Stub::GetOrCreateInstance(); + SaveToSharedMemory(stub->ShmPool()); + AllocatedSharedMemory model_loader_msg_shm; + + try { + stub->SendMessage( + model_loader_msg_shm, PYTHONSTUB_LoadModelRequest, shm_handle_); + } + catch (const PythonBackendException& pb_exception) { + throw PythonBackendException( + "Failed to load model: " + std::string(pb_exception.what())); + } +} + +void +ModelLoader::SendUnloadModelRequest() +{ + std::unique_ptr& stub = Stub::GetOrCreateInstance(); + SaveToSharedMemory(stub->ShmPool()); + AllocatedSharedMemory model_loader_msg_shm; + try { + stub->SendMessage( + 
model_loader_msg_shm, PYTHONSTUB_UnloadModelRequest, shm_handle_); + } + catch (const PythonBackendException& pb_exception) { + throw PythonBackendException( + "Failed to unload model: " + std::string(pb_exception.what())); + } +} + +bool +ModelLoader::SendModelReadinessRequest() +{ + std::unique_ptr& stub = Stub::GetOrCreateInstance(); + SaveToSharedMemory(stub->ShmPool()); + ModelLoaderMessage* model_loader_msg = nullptr; + AllocatedSharedMemory model_loader_msg_shm; + try { + stub->SendMessage( + model_loader_msg_shm, PYTHONSTUB_ModelReadinessRequest, shm_handle_); + } + catch (const PythonBackendException& pb_exception) { + throw PythonBackendException( + "Failed to check model readiness: " + std::string(pb_exception.what())); + } + + model_loader_msg = model_loader_msg_shm.data_.get(); + return model_loader_msg->is_model_ready; +} + +void +LoadModel( + const std::string& name, const std::string& config, const py::object& files) +{ + std::unordered_map files_map; + + if (!files.is_none()) { + if (!py::isinstance(files)) { + throw PythonBackendException( + "failed to load model '" + name + + "', files should be a dictionary of file paths and file contents"); + } + + py::dict files_dict = py::cast(files); + for (const auto& item : files_dict) { + std::string key = py::cast(item.first); + py::bytes value = py::cast(item.second); + std::string content(value); + files_map[key] = content; + } + } + + ModelLoader model_loader(name, config, files_map); + model_loader.SendLoadModelRequest(); +} + +void +UnloadModel(const std::string& name, const bool unload_dependents) +{ + ModelLoader model_loader(name, unload_dependents); + model_loader.SendUnloadModelRequest(); +} + +bool +IsModelReady(const std::string& name, const std::string& version) +{ + ModelLoader model_loader(name, version); + return model_loader.SendModelReadinessRequest(); +} +#else +void +ModelLoader::LoadModel(TRITONSERVER_Server* server) +{ + std::string path = ""; + std::string file_content = ""; + std::vector const_params; + if (!config_.empty()) { + const_params.emplace_back(TRITONSERVER_ParameterNew( + "config", TRITONSERVER_PARAMETER_STRING, config_.c_str())); + } + if (!files_.empty()) { + for (auto& file : files_) { + path = file.first; + file_content = file.second; + const_params.emplace_back(TRITONSERVER_ParameterBytesNew( + path.c_str(), file_content.data(), file_content.size())); + } + } + + THROW_IF_TRITON_ERROR(TRITONSERVER_ServerLoadModelWithParameters( + server, name_.c_str(), const_params.data(), const_params.size())); + + for (const auto param : const_params) { + TRITONSERVER_ParameterDelete(const_cast(param)); + } +} + +void +ModelLoader::UnloadModel(TRITONSERVER_Server* server) +{ + if (unload_dependents_) { + THROW_IF_TRITON_ERROR( + TRITONSERVER_ServerUnloadModelAndDependents(server, name_.c_str())); + } else { + THROW_IF_TRITON_ERROR( + TRITONSERVER_ServerUnloadModel(server, name_.c_str())); + } +} + +bool +ModelLoader::IsModelReady(TRITONSERVER_Server* server) +{ + bool is_ready = false; + int64_t model_version = GetModelVersionFromString(version_); + THROW_IF_TRITON_ERROR(TRITONSERVER_ServerModelIsReady( + server, name_.c_str(), model_version, &is_ready)); + return is_ready; +} + +int64_t +ModelLoader::GetModelVersionFromString(const std::string& version_string) +{ + int64_t version = -1; + if (!version_string.empty()) { + try { + version = std::stol(version_string); + } + catch (std::exception& e) { + throw PythonBackendException( + "failed to get model version from specified version string '" + + 
version_string + "' (details: " + e.what() + + "), version should be an integral value > 0"); + } + + if (version < 0) { + throw PythonBackendException( + "failed to get model version from specified version string '" + + version_string + "', version should be an integral value > 0"); + } + } + return version; +} +#endif +}}} // namespace triton::backend::python diff --git a/src/model_loader.h b/src/model_loader.h new file mode 100644 index 00000000..e4fe9fd6 --- /dev/null +++ b/src/model_loader.h @@ -0,0 +1,165 @@ +// Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions +// are met: +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// * Neither the name of NVIDIA CORPORATION nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +#pragma once + +#include +#include + +#include "ipc_message.h" +#include "pb_map.h" +#include "pb_string.h" +#include "pb_utils.h" + +#ifdef TRITON_PB_STUB +#include +namespace py = pybind11; +#else +#include "triton/core/tritonserver.h" +#endif + +namespace triton { namespace backend { namespace python { + +// The 'ModelLoaderRequestShm' struct is utilized by the 'ModelLoader' class for +// saving the essential data to shared memory and for loading the data from +// shared memory in order to reconstruct the 'ModelLoader' object. +struct ModelLoaderRequestShm { + // The shared memory handle of the model name in PbString format. + bi::managed_external_buffer::handle_t name_shm_handle; + // The shared memory handle of the model version in PbString format. + bi::managed_external_buffer::handle_t version_shm_handle; + // The flag to unload the dependent models. + bool unload_dependents; + // The shared memory handle of the config in PbString format. + bi::managed_external_buffer::handle_t config_shm_handle; + // The shared memory handle of the files in PbMap format. 
+ bi::managed_external_buffer::handle_t files_shm_handle; +}; + +class ModelLoader { + public: + ModelLoader( + const std::string& name, const std::string& config, + const std::unordered_map& files) + : name_(name), version_(""), config_(config), files_(files), + unload_dependents_(false) + { + } + + ModelLoader(const std::string& name, const bool unload_dependents) + : name_(name), version_(""), config_(""), files_({}), + unload_dependents_(unload_dependents) + { + } + + ModelLoader(const std::string& name, const std::string& version) + : name_(name), version_(version), config_(""), files_({}), + unload_dependents_(false) + { + } + + /// Save ModelLoader object to shared memory. + /// \param shm_pool Shared memory pool to save the ModelLoader object. + void SaveToSharedMemory(std::unique_ptr& shm_pool); + + /// Create a ModelLoader object from shared memory. + /// \param shm_pool Shared memory pool + /// \param handle Shared memory handle of the ModelLoader. + /// \return Returns the ModelLoaders in the specified request_handle + /// location. + static std::unique_ptr LoadFromSharedMemory( + std::unique_ptr& shm_pool, + bi::managed_external_buffer::handle_t handle); +#ifdef TRITON_PB_STUB + /// Send a request to load the model. + void SendLoadModelRequest(); + + /// Send a request to unload the model. + void SendUnloadModelRequest(); + + /// Send a request to check if the model is ready. + bool SendModelReadinessRequest(); +#else + /// Use Triton C API to load the model. + /// \param server The Triton server object. + void LoadModel(TRITONSERVER_Server* server); + + /// Use Triton C API to unload the model. + /// \param server The Triton server object. + void UnloadModel(TRITONSERVER_Server* server); + + /// Use Triton C API to check if the model is ready. + /// \param server The Triton server object. + /// \return Returns true if the model is ready. + bool IsModelReady(TRITONSERVER_Server* server); + + /// Get the model version from the version string. + /// \param version_string The version string. + /// \return Returns the model version in uint64_t. + int64_t GetModelVersionFromString(const std::string& version_string); +#endif + /// Disallow copying the ModelLoader object. + DISALLOW_COPY_AND_ASSIGN(ModelLoader); + + private: + // The private constructor for creating a Metric object from shared memory. + ModelLoader( + AllocatedSharedMemory& model_loader_req_shm, + std::unique_ptr& name_shm, + std::unique_ptr& version_shm, + std::unique_ptr& config_shm, std::unique_ptr& files_shm); + + // The name of the model. + std::string name_; + // The version of the model. + std::string version_; + // The configuration of the model. + std::string config_; + // The files of the model. + std::unordered_map files_; + // The flag to unload the dependent models. + bool unload_dependents_; + + // // Shared Memory Data Structures + AllocatedSharedMemory model_loader_req_shm_; + ModelLoaderRequestShm* model_loader_req_shm_ptr_; + bi::managed_external_buffer::handle_t shm_handle_; + std::unique_ptr name_shm_; + std::unique_ptr version_shm_; + std::unique_ptr config_shm_; + std::unique_ptr files_shm_; +}; + +#ifdef TRITON_PB_STUB +// The binding functions for the Python stub. 
+void LoadModel( + const std::string& name, const std::string& config, + const py::object& files = py::none()); +void UnloadModel(const std::string& name, const bool unload_dependents); +bool IsModelReady(const std::string& name, const std::string& version); +#endif + +}}}; // namespace triton::backend::python diff --git a/src/pb_bls_cancel.cc b/src/pb_bls_cancel.cc new file mode 100644 index 00000000..4341c037 --- /dev/null +++ b/src/pb_bls_cancel.cc @@ -0,0 +1,93 @@ +// Copyright 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions +// are met: +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// * Neither the name of NVIDIA CORPORATION nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +#include "pb_bls_cancel.h" + +#include "pb_stub.h" +#include "pb_stub_log.h" + +namespace triton { namespace backend { namespace python { + +void +PbBLSCancel::SaveToSharedMemory(std::unique_ptr& shm_pool) +{ + cancel_shm_ = shm_pool->Construct(); + new (&(cancel_shm_.data_->mu)) bi::interprocess_mutex; + new (&(cancel_shm_.data_->cv)) bi::interprocess_condition; + cancel_shm_.data_->waiting_on_stub = false; + cancel_shm_.data_->infer_payload_id = infer_playload_id_; + cancel_shm_.data_->is_cancelled = is_cancelled_; +} + +bi::managed_external_buffer::handle_t +PbBLSCancel::ShmHandle() +{ + return cancel_shm_.handle_; +} + +CancelBLSRequestMessage* +PbBLSCancel::ShmPayload() +{ + return cancel_shm_.data_.get(); +} + +void +PbBLSCancel::Cancel() +{ + // Release the GIL. Python objects are not accessed during the check. + py::gil_scoped_release gil_release; + + std::unique_lock lk(mu_); + // The cancelled flag can only move from false to true, not the other way, so + // it is checked on each query until cancelled and then implicitly cached. 
+ if (is_cancelled_) { + return; + } + if (!updating_) { + std::unique_ptr& stub = Stub::GetOrCreateInstance(); + if (!stub->StubToParentServiceActive()) { + LOG_ERROR << "Cannot communicate with parent service"; + return; + } + + stub->EnqueueCancelBLSRequest(this); + updating_ = true; + } + cv_.wait(lk, [this] { return !updating_; }); +} + +void +PbBLSCancel::ReportIsCancelled(bool is_cancelled) +{ + { + std::lock_guard lk(mu_); + is_cancelled_ = is_cancelled; + updating_ = false; + } + cv_.notify_all(); +} + +}}} // namespace triton::backend::python diff --git a/src/pb_bls_cancel.h b/src/pb_bls_cancel.h new file mode 100644 index 00000000..7fdd3fbf --- /dev/null +++ b/src/pb_bls_cancel.h @@ -0,0 +1,63 @@ +// Copyright 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions +// are met: +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// * Neither the name of NVIDIA CORPORATION nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +#pragma once + +#include +#include + +#include "pb_utils.h" + +namespace triton { namespace backend { namespace python { + +class PbBLSCancel { + public: + PbBLSCancel(void* infer_playload_id) + : updating_(false), infer_playload_id_(infer_playload_id), + is_cancelled_(false) + { + } + DISALLOW_COPY_AND_ASSIGN(PbBLSCancel); + + void SaveToSharedMemory(std::unique_ptr& shm_pool); + bi::managed_external_buffer::handle_t ShmHandle(); + CancelBLSRequestMessage* ShmPayload(); + + void Cancel(); + void ReportIsCancelled(bool is_cancelled); + + private: + AllocatedSharedMemory cancel_shm_; + + std::mutex mu_; + std::condition_variable cv_; + bool updating_; + + void* infer_playload_id_; + bool is_cancelled_; +}; + +}}}; // namespace triton::backend::python diff --git a/src/pb_cancel.cc b/src/pb_cancel.cc new file mode 100644 index 00000000..da9daf98 --- /dev/null +++ b/src/pb_cancel.cc @@ -0,0 +1,94 @@ +// Copyright 2023-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions +// are met: +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// * Neither the name of NVIDIA CORPORATION nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +#include "pb_cancel.h" + +#include "pb_stub.h" +#include "pb_stub_log.h" + +namespace triton { namespace backend { namespace python { + +void +PbCancel::SaveToSharedMemory(std::unique_ptr& shm_pool) +{ + cancel_shm_ = shm_pool->Construct(); + new (&(cancel_shm_.data_->mu)) bi::interprocess_mutex; + new (&(cancel_shm_.data_->cv)) bi::interprocess_condition; + cancel_shm_.data_->waiting_on_stub = false; + cancel_shm_.data_->response_factory_address = response_factory_address_; + cancel_shm_.data_->request_address = request_address_; + cancel_shm_.data_->is_cancelled = is_cancelled_; +} + +bi::managed_external_buffer::handle_t +PbCancel::ShmHandle() +{ + return cancel_shm_.handle_; +} + +IsCancelledMessage* +PbCancel::ShmPayload() +{ + return cancel_shm_.data_.get(); +} + +bool +PbCancel::IsCancelled() +{ + // Release the GIL. Python objects are not accessed during the check. + py::gil_scoped_release gil_release; + + std::unique_lock lk(mu_); + // The cancelled flag can only move from false to true, not the other way, so + // it is checked on each query until cancelled and then implicitly cached. + if (is_cancelled_) { + return is_cancelled_; + } + if (!updating_) { + std::unique_ptr& stub = Stub::GetOrCreateInstance(); + if (!stub->StubToParentServiceActive()) { + LOG_ERROR << "Cannot communicate with parent service"; + return false; + } + stub->EnqueueIsCancelled(this); + updating_ = true; + } + cv_.wait(lk, [this] { return !updating_; }); + return is_cancelled_; +} + +void +PbCancel::ReportIsCancelled(bool is_cancelled) +{ + { + std::lock_guard lk(mu_); + is_cancelled_ = is_cancelled; + updating_ = false; + } + cv_.notify_all(); +} + +}}} // namespace triton::backend::python diff --git a/src/pb_cancel.h b/src/pb_cancel.h new file mode 100644 index 00000000..3ebf07b5 --- /dev/null +++ b/src/pb_cancel.h @@ -0,0 +1,64 @@ +// Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions +// are met: +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// * Neither the name of NVIDIA CORPORATION nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +#pragma once + +#include +#include + +#include "pb_utils.h" + +namespace triton { namespace backend { namespace python { + +class PbCancel { + public: + PbCancel(intptr_t response_factory_address, intptr_t request_address) + : updating_(false), response_factory_address_(response_factory_address), + request_address_(request_address), is_cancelled_(false) + { + } + DISALLOW_COPY_AND_ASSIGN(PbCancel); + + void SaveToSharedMemory(std::unique_ptr& shm_pool); + bi::managed_external_buffer::handle_t ShmHandle(); + IsCancelledMessage* ShmPayload(); + + bool IsCancelled(); + void ReportIsCancelled(bool is_cancelled); + + private: + AllocatedSharedMemory cancel_shm_; + + std::mutex mu_; + std::condition_variable cv_; + bool updating_; + + intptr_t response_factory_address_; + intptr_t request_address_; + bool is_cancelled_; +}; + +}}}; // namespace triton::backend::python diff --git a/src/pb_env.cc b/src/pb_env.cc index 4d09a0ce..d9643a62 100644 --- a/src/pb_env.cc +++ b/src/pb_env.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +// Copyright (c) 2021-2022, NVIDIA CORPORATION. All rights reserved. // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions @@ -26,17 +26,157 @@ #include "pb_env.h" +#ifndef _WIN32 #include #include #include +#endif +#include + #include #include #include + #include "pb_utils.h" namespace triton { namespace backend { namespace python { +bool +FileExists(std::string& path) +{ + struct stat buffer; + return stat(path.c_str(), &buffer) == 0; +} + +void +LastModifiedTime(const std::string& path, time_t* last_modified_time) +{ + struct stat result; + if (stat(path.c_str(), &result) == 0) { + *last_modified_time = result.st_mtime; + } else { + throw PythonBackendException(std::string( + "LastModifiedTime() failed as file \'" + path + + std::string("\' does not exists."))); + } +} + +// FIXME: [DLIS-5969]: Develop platforom-agnostic functions +// to support custom python environments. 
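
// Illustrative sketch (not part of the backend source): the PbCancel and
// PbBLSCancel classes above both implement the same wait/notify handshake --
// the querying thread enqueues a request to the parent process and blocks on a
// condition variable until another thread calls ReportIsCancelled(). A
// minimal, standalone version of that pattern, with hypothetical names:

#include <condition_variable>
#include <functional>
#include <mutex>

class CancellationFlag {
 public:
  // 'enqueue' stands in for Stub::EnqueueIsCancelled(); it must eventually
  // cause Report() to be invoked from another thread.
  explicit CancellationFlag(std::function<void()> enqueue)
      : enqueue_(std::move(enqueue))
  {
  }

  // Mirrors PbCancel::IsCancelled(): the flag only moves from false to true,
  // so a true value is effectively cached after the first positive answer.
  bool Query()
  {
    std::unique_lock<std::mutex> lk(mu_);
    if (is_cancelled_) {
      return true;
    }
    if (!updating_) {
      enqueue_();        // Ask the other side for the current state.
      updating_ = true;  // Avoid queuing duplicate requests.
    }
    cv_.wait(lk, [this] { return !updating_; });
    return is_cancelled_;
  }

  // Mirrors PbCancel::ReportIsCancelled(), invoked from the message thread.
  void Report(bool is_cancelled)
  {
    {
      std::lock_guard<std::mutex> lk(mu_);
      is_cancelled_ = is_cancelled;
      updating_ = false;
    }
    cv_.notify_all();
  }

 private:
  std::function<void()> enqueue_;
  std::mutex mu_;
  std::condition_variable cv_;
  bool updating_ = false;
  bool is_cancelled_ = false;
};
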
+#ifndef _WIN32 +void +CopySingleArchiveEntry(archive* input_archive, archive* output_archive) +{ + const void* buff; + size_t size; +#if ARCHIVE_VERSION_NUMBER >= 3000000 + int64_t offset; +#else + off_t offset; +#endif + + for (;;) { + int return_status; + return_status = + archive_read_data_block(input_archive, &buff, &size, &offset); + if (return_status == ARCHIVE_EOF) + break; + if (return_status != ARCHIVE_OK) + throw PythonBackendException( + "archive_read_data_block() failed with error code = " + + std::to_string(return_status)); + + return_status = + archive_write_data_block(output_archive, buff, size, offset); + if (return_status != ARCHIVE_OK) { + throw PythonBackendException( + "archive_write_data_block() failed with error code = " + + std::to_string(return_status) + ", error message is " + + archive_error_string(output_archive)); + } + } +} + +void +ExtractTarFile(std::string& archive_path, std::string& dst_path) +{ + char current_directory[PATH_MAX]; + if (getcwd(current_directory, PATH_MAX) == nullptr) { + throw PythonBackendException( + (std::string("Failed to get the current working directory. Error: ") + + std::strerror(errno))); + } + if (chdir(dst_path.c_str()) == -1) { + throw PythonBackendException( + (std::string("Failed to change the directory to ") + dst_path + + " Error: " + std::strerror(errno)) + .c_str()); + } + + struct archive_entry* entry; + int flags = ARCHIVE_EXTRACT_TIME; + + struct archive* input_archive = archive_read_new(); + struct archive* output_archive = archive_write_disk_new(); + archive_write_disk_set_options(output_archive, flags); + + archive_read_support_filter_gzip(input_archive); + archive_read_support_format_tar(input_archive); + + if (archive_path.size() == 0) { + throw PythonBackendException("The archive path is empty."); + } + + THROW_IF_ERROR( + "archive_read_open_filename() failed.", + archive_read_open_filename( + input_archive, archive_path.c_str(), 10240 /* block_size */)); + + while (true) { + int read_status = archive_read_next_header(input_archive, &entry); + if (read_status == ARCHIVE_EOF) + break; + if (read_status != ARCHIVE_OK) { + throw PythonBackendException( + std::string("archive_read_next_header() failed with error code = ") + + std::to_string(read_status) + std::string(" error message is ") + + archive_error_string(input_archive)); + } + + read_status = archive_write_header(output_archive, entry); + if (read_status != ARCHIVE_OK) { + throw PythonBackendException(std::string( + "archive_write_header() failed with error code = " + + std::to_string(read_status) + std::string(" error message is ") + + archive_error_string(output_archive))); + } + + CopySingleArchiveEntry(input_archive, output_archive); + + read_status = archive_write_finish_entry(output_archive); + if (read_status != ARCHIVE_OK) { + throw PythonBackendException(std::string( + "archive_write_finish_entry() failed with error code = " + + std::to_string(read_status) + std::string(" error message is ") + + archive_error_string(output_archive))); + } + } + + archive_read_close(input_archive); + archive_read_free(input_archive); + + archive_write_close(output_archive); + archive_write_free(output_archive); + + // Revert the directory change. 
+ if (chdir(current_directory) == -1) { + throw PythonBackendException( + (std::string("Failed to change the directory to ") + current_directory) + .c_str()); + } +} + void RecursiveDirectoryDelete(const char* dir) { @@ -112,10 +252,54 @@ EnvironmentManager::ExtractIfNotExtracted(std::string env_path) std::string("Failed to get the canonical path for ") + env_path + "."); } + time_t last_modified_time; + LastModifiedTime(canonical_env_path, &last_modified_time); + + bool env_extracted = false; + bool re_extraction = false; + + // If the path is not a conda-packed file, then bypass the extraction process + struct stat info; + if (stat(canonical_env_path, &info) != 0) { + throw PythonBackendException( + std::string("stat() of : ") + canonical_env_path + " returned error."); + } else if (S_ISDIR(info.st_mode)) { + LOG_MESSAGE( + TRITONSERVER_LOG_VERBOSE, + (std::string("Returning canonical path since EXECUTION_ENV_PATH does " + "not contain compressed path. Path: ") + + canonical_env_path) + .c_str()); + return canonical_env_path; + } + const auto env_itr = env_map_.find(canonical_env_path); + if (env_itr != env_map_.end()) { + // Check if the environment has been modified and would + // need to be extracted again. + if (env_itr->second.second == last_modified_time) { + env_extracted = true; + } else { + // Environment file has been updated. Need to clear + // the previously extracted environment and extract + // the environment to the same destination directory. + RecursiveDirectoryDelete(env_itr->second.first.c_str()); + re_extraction = true; + } + } + // Extract only if the env has not been extracted yet. - if (env_map_.find(canonical_env_path) == env_map_.end()) { - std::string dst_env_path( - std::string(base_path_) + "/" + std::to_string(env_map_.size())); + if (!env_extracted) { + LOG_MESSAGE( + TRITONSERVER_LOG_VERBOSE, + (std::string("Extracting Python execution env ") + canonical_env_path) + .c_str()); + std::string dst_env_path; + if (re_extraction) { + dst_env_path = env_map_[canonical_env_path].first; + } else { + dst_env_path = + std::string(base_path_) + "/" + std::to_string(env_map_.size()); + } std::string canonical_env_path_str(canonical_env_path); @@ -128,12 +312,16 @@ EnvironmentManager::ExtractIfNotExtracted(std::string env_path) std::string("Failed to create environment directory for '") + dst_env_path.c_str() + "'."); } - - // Add the path to the list of environments - env_map_.insert({canonical_env_path, dst_env_path}); + if (re_extraction) { + // Just update the last modified timestamp + env_map_[canonical_env_path].second = last_modified_time; + } else { + // Add the path to the list of environments + env_map_.insert({canonical_env_path, {dst_env_path, last_modified_time}}); + } return dst_env_path; } else { - return env_map_.find(canonical_env_path)->second; + return env_map_.find(canonical_env_path)->second.first; } } @@ -141,5 +329,6 @@ EnvironmentManager::~EnvironmentManager() { RecursiveDirectoryDelete(base_path_); } +#endif }}} // namespace triton::backend::python diff --git a/src/pb_env.h b/src/pb_env.h index 2b49e27c..04e01fa3 100644 --- a/src/pb_env.h +++ b/src/pb_env.h @@ -1,4 +1,4 @@ -// Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +// Copyright (c) 2021-2022, NVIDIA CORPORATION. All rights reserved. 
// // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions @@ -30,13 +30,23 @@ #include #include +#ifdef WIN32 +#include +#undef PATH_MAX +#define PATH_MAX MAX_PATH +#endif namespace triton { namespace backend { namespace python { +void ExtractTarFile(std::string& archive_path, std::string& dst_path); + +bool FileExists(std::string& path); + // // A class that manages Python environments // +#ifndef _WIN32 class EnvironmentManager { - std::map env_map_; + std::map> env_map_; char base_path_[PATH_MAX + 1]; std::mutex mutex_; @@ -48,5 +58,6 @@ class EnvironmentManager { std::string ExtractIfNotExtracted(std::string env_path); ~EnvironmentManager(); }; +#endif -}}} // namespace triton::backend::python \ No newline at end of file +}}} // namespace triton::backend::python diff --git a/src/pb_error.cc b/src/pb_error.cc new file mode 100644 index 00000000..0e5d0bd4 --- /dev/null +++ b/src/pb_error.cc @@ -0,0 +1,85 @@ +// Copyright 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions +// are met: +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// * Neither the name of NVIDIA CORPORATION nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
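
// Illustrative sketch (not part of the backend source): the EnvironmentManager
// change above caches, per canonical archive path, both the extraction
// directory and the archive's last-modified time, and re-extracts into the
// same directory whenever the timestamp changes. The helpers below are
// hypothetical stand-ins for ExtractTarFile(), RecursiveDirectoryDelete() and
// the stat()-based LastModifiedTime() shown in pb_env.cc.

#include <ctime>
#include <map>
#include <string>
#include <utility>

time_t LastModified(const std::string& path);  // stat()-based, defined elsewhere
void ExtractArchiveTo(const std::string& archive, const std::string& dst);
void DeleteDirectory(const std::string& dir);

class ExtractionCache {
 public:
  // Returns the directory holding the extracted environment, extracting or
  // re-extracting only when the archive is new or has been modified.
  std::string GetOrExtract(const std::string& canonical_path)
  {
    const time_t mtime = LastModified(canonical_path);
    auto it = cache_.find(canonical_path);
    if (it != cache_.end()) {
      if (it->second.second == mtime) {
        return it->second.first;  // Unchanged archive: reuse the extraction.
      }
      // Archive was updated: clear the stale copy and reuse the directory.
      DeleteDirectory(it->second.first);
      ExtractArchiveTo(canonical_path, it->second.first);
      it->second.second = mtime;
      return it->second.first;
    }
    std::string dst = base_path_ + "/" + std::to_string(cache_.size());
    ExtractArchiveTo(canonical_path, dst);
    cache_.emplace(canonical_path, std::make_pair(dst, mtime));
    return dst;
  }

 private:
  std::string base_path_ = "/tmp/python_env";  // Hypothetical base directory.
  // canonical archive path -> {extraction directory, last-modified time}
  std::map<std::string, std::pair<std::string, time_t>> cache_;
};
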
+ +#include "pb_error.h" + +namespace triton { namespace backend { namespace python { + +TRITONSERVER_Error_Code +PbError::Code() +{ + return code_; +} + +const std::string& +PbError::Message() +{ + return message_; +} + +bi::managed_external_buffer::handle_t +PbError::ShmHandle() +{ + return shm_handle_; +} + +void +PbError::SaveToSharedMemory(std::unique_ptr& shm_pool) +{ + message_shm_ = PbString::Create(shm_pool, message_); + error_shm_ = shm_pool->Construct(); + error_shm_.data_->code = code_; + error_shm_.data_->message_shm_handle = message_shm_->ShmHandle(); + shm_handle_ = error_shm_.handle_; +} + +std::shared_ptr +PbError::LoadFromSharedMemory( + std::unique_ptr& shm_pool, + bi::managed_external_buffer::handle_t shm_handle) +{ + AllocatedSharedMemory error_shm = + shm_pool->Load(shm_handle); + std::unique_ptr message_shm = PbString::LoadFromSharedMemory( + shm_pool, error_shm.data_->message_shm_handle); + + TRITONSERVER_Error_Code code = error_shm.data_->code; + std::string message = message_shm->String(); + + return std::shared_ptr(new PbError( + std::move(message_shm), std::move(error_shm), code, std::move(message))); +} + +PbError::PbError( + std::shared_ptr&& message_shm, + AllocatedSharedMemory&& error_shm, TRITONSERVER_Error_Code code, + std::string&& message) + : message_shm_(std::move(message_shm)), error_shm_(std::move(error_shm)), + code_(code), message_(std::move(message)) +{ +} + +}}} // namespace triton::backend::python diff --git a/src/pb_error.h b/src/pb_error.h new file mode 100644 index 00000000..6001459a --- /dev/null +++ b/src/pb_error.h @@ -0,0 +1,75 @@ +// Copyright 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions +// are met: +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// * Neither the name of NVIDIA CORPORATION nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
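
// Usage sketch (not part of this diff): a PbError is built from a message and
// a TRITONSERVER_Error_Code, serialized into the shared memory pool, and later
// reconstructed on the other side of the process boundary from its handle
// alone. The function name is hypothetical, and the backend headers above plus
// an initialized SharedMemoryManager pool are assumed.

std::shared_ptr<PbError> RoundTripError(
    std::unique_ptr<SharedMemoryManager>& shm_pool)
{
  // Producer side: write the error code plus the message PbString to shm.
  PbError error(
      "input tensor 'INPUT0' is missing", TRITONSERVER_ERROR_INVALID_ARG);
  error.SaveToSharedMemory(shm_pool);
  bi::managed_external_buffer::handle_t handle = error.ShmHandle();

  // Consumer side: Code() and Message() are recovered from the handle.
  return PbError::LoadFromSharedMemory(shm_pool, handle);
}
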
+ +#pragma once + +#include + +#include "pb_string.h" +#include "pb_utils.h" + +namespace triton { namespace backend { namespace python { + +struct PbErrorShm { + TRITONSERVER_Error_Code code; + bi::managed_external_buffer::handle_t message_shm_handle; +}; + +class PbError { + public: + PbError( + const std::string& message, + TRITONSERVER_Error_Code code = TRITONSERVER_ERROR_INTERNAL) + : code_(code), message_(message) + { + } + DISALLOW_COPY_AND_ASSIGN(PbError); + + TRITONSERVER_Error_Code Code(); + const std::string& Message(); + + void SaveToSharedMemory(std::unique_ptr& shm_pool); + bi::managed_external_buffer::handle_t ShmHandle(); + + static std::shared_ptr LoadFromSharedMemory( + std::unique_ptr& shm_pool, + bi::managed_external_buffer::handle_t handle); + + private: + PbError( + std::shared_ptr&& message_shm, + AllocatedSharedMemory&& error_shm, + TRITONSERVER_Error_Code code, std::string&& message); + + std::shared_ptr message_shm_; + AllocatedSharedMemory error_shm_; + bi::managed_external_buffer::handle_t shm_handle_; + + TRITONSERVER_Error_Code code_; + std::string message_; +}; + +}}}; // namespace triton::backend::python diff --git a/src/pb_exception.h b/src/pb_exception.h new file mode 100644 index 00000000..6f96d02a --- /dev/null +++ b/src/pb_exception.h @@ -0,0 +1,46 @@ +// Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions +// are met: +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// * Neither the name of NVIDIA CORPORATION nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +#pragma once + +#include + +namespace triton { namespace backend { namespace python { + +// +// PythonBackendException +// +// Exception thrown if error occurs in PythonBackend. +// +struct PythonBackendException : std::exception { + PythonBackendException(const std::string& message) : message_(message) {} + + const char* what() const throw() { return message_.c_str(); } + + std::string message_; +}; + +}}} // namespace triton::backend::python diff --git a/src/pb_log.cc b/src/pb_log.cc new file mode 100644 index 00000000..629fb914 --- /dev/null +++ b/src/pb_log.cc @@ -0,0 +1,121 @@ +// Copyright 2022-2023, NVIDIA CORPORATION & AFFILIATES. 
All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions +// are met: +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// * Neither the name of NVIDIA CORPORATION nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +#include "pb_log.h" + +namespace triton { namespace backend { namespace python { + +PbLog::PbLog( + const std::string& filename, uint32_t line, const std::string& message, + LogLevel level) + : filename_(filename), line_(line), message_(message), level_(level) +{ +} + +const std::string& +PbLog::Filename() +{ + return filename_; +} +const std::string& +PbLog::Message() +{ + return message_; +} +const LogLevel& +PbLog::Level() +{ + return level_; +} +const uint32_t& +PbLog::Line() +{ + return line_; +} + +PbLogShm::PbLogShm( + AllocatedSharedMemory& log_container_shm, + std::unique_ptr& filename, std::unique_ptr& message) + : log_container_shm_(std::move(log_container_shm)), + filename_pb_string_(std::move(filename)), + message_pb_string_(std::move(message)) +{ + log_container_shm_ptr_ = log_container_shm_.data_.get(); + log_container_shm_ptr_->filename = filename_pb_string_->ShmHandle(); + log_container_shm_ptr_->log_message = message_pb_string_->ShmHandle(); +} + +std::unique_ptr +PbLogShm::Create( + std::unique_ptr& shm_pool, const std::string& filename, + const uint32_t& line, const std::string& message, const LogLevel& level) +{ + std::unique_ptr file_name = PbString::Create(shm_pool, filename); + std::unique_ptr log_message = PbString::Create(shm_pool, message); + AllocatedSharedMemory log_send_message = + shm_pool->Construct(); + + LogSendMessage* send_message_payload = log_send_message.data_.get(); + new (&(send_message_payload->mu)) bi::interprocess_mutex; + new (&(send_message_payload->cv)) bi::interprocess_condition; + send_message_payload->line = line; + send_message_payload->level = level; + + return std::unique_ptr( + new PbLogShm(log_send_message, file_name, log_message)); +} + +std::unique_ptr +PbLogShm::LoadFromSharedMemory( + std::unique_ptr& shm_pool, + bi::managed_external_buffer::handle_t handle) +{ + AllocatedSharedMemory log_container_shm = + shm_pool->Load(handle); + std::unique_ptr pb_string_filename = PbString::LoadFromSharedMemory( + shm_pool, 
log_container_shm.data_->filename); + const std::string& filename = pb_string_filename->String(); + uint32_t line = log_container_shm.data_->line; + std::unique_ptr pb_string_msg = PbString::LoadFromSharedMemory( + shm_pool, log_container_shm.data_->log_message); + const std::string& message = pb_string_msg->String(); + LogLevel level = log_container_shm.data_->level; + return std::unique_ptr(new PbLog(filename, line, message, level)); +} + +bi::managed_external_buffer::handle_t +PbLogShm::ShmHandle() +{ + return log_container_shm_.handle_; +} + +LogSendMessage* +PbLogShm::LogMessage() +{ + return log_container_shm_ptr_; +} + +}}} // namespace triton::backend::python diff --git a/src/pb_log.h b/src/pb_log.h new file mode 100644 index 00000000..65d41009 --- /dev/null +++ b/src/pb_log.h @@ -0,0 +1,91 @@ +// Copyright 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions +// are met: +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// * Neither the name of NVIDIA CORPORATION nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
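
// Usage sketch (not part of this diff): the same save/load pattern applied to
// a log record. The stub serializes the filename, line, message and level with
// PbLogShm::Create() and passes only the handle to the parent, which rebuilds
// a plain PbLog from it. The function name is hypothetical and an initialized
// SharedMemoryManager pool is assumed.

std::unique_ptr<PbLog> RoundTripLog(
    std::unique_ptr<SharedMemoryManager>& shm_pool, LogLevel level)
{
  std::unique_ptr<PbLogShm> log_shm = PbLogShm::Create(
      shm_pool, __FILE__, __LINE__, "initialize() finished", level);
  bi::managed_external_buffer::handle_t handle = log_shm->ShmHandle();

  // Parent side: reload the record and forward it to the Triton logger.
  return PbLogShm::LoadFromSharedMemory(shm_pool, handle);
}
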
+ +#pragma once + +#include + +#include "pb_string.h" +#include "pb_utils.h" + +namespace triton { namespace backend { namespace python { +class PbLog { + public: + /// Create a PbLog instance + PbLog( + const std::string& filename, uint32_t line, const std::string& message, + LogLevel level); + + /// Get the filename where the log was recorded + const std::string& Filename(); + + /// Get the log message + const std::string& Message(); + + /// Get the log level of the message + const LogLevel& Level(); + + /// Get the line number of the log message + const uint32_t& Line(); + + private: + std::string filename_; + uint32_t line_; + std::string message_; + LogLevel level_; +}; + +class PbLogShm { + public: + /// Save PbLog object to shared memory + static std::unique_ptr Create( + std::unique_ptr& shm_pool, + const std::string& filename, const uint32_t& line, + const std::string& message, const LogLevel& level); + + /// Load PbLog object to shared memory + static std::unique_ptr LoadFromSharedMemory( + std::unique_ptr& shm_pool, + bi::managed_external_buffer::handle_t handle); + + /// Get the shared memory handle of the saved log message + bi::managed_external_buffer::handle_t ShmHandle(); + + /// Get a pointer to the saved log message + LogSendMessage* LogMessage(); + + private: + AllocatedSharedMemory log_container_shm_; + std::unique_ptr filename_pb_string_; + std::unique_ptr message_pb_string_; + + LogSendMessage* log_container_shm_ptr_; + + PbLogShm( + AllocatedSharedMemory& log_container_shm, + std::unique_ptr& filename, std::unique_ptr& message); +}; +}}}; // namespace triton::backend::python diff --git a/src/pb_map.cc b/src/pb_map.cc new file mode 100644 index 00000000..a122db56 --- /dev/null +++ b/src/pb_map.cc @@ -0,0 +1,110 @@ +// Copyright 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions +// are met: +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// * Neither the name of NVIDIA CORPORATION nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
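
// Illustrative sketch (not part of the backend source): several of the shm
// payloads in this diff (CancelBLSRequestMessage, IsCancelledMessage,
// LogSendMessage) carry a boost::interprocess mutex/condition pair that is
// constructed in place with placement new and then used for a cross-process
// wait/notify handshake. A stripped-down, POSIX-only version of that setup,
// with a hypothetical payload struct, looks like this:

#include <boost/interprocess/anonymous_shared_memory.hpp>
#include <boost/interprocess/mapped_region.hpp>
#include <boost/interprocess/sync/interprocess_condition.hpp>
#include <boost/interprocess/sync/interprocess_mutex.hpp>
#include <boost/interprocess/sync/scoped_lock.hpp>
#include <new>

namespace bi = boost::interprocess;

struct HandshakePayload {
  bi::interprocess_mutex mu;
  bi::interprocess_condition cv;
  bool waiting_on_stub;
};

int main()
{
  // The real payloads live inside the managed external buffer; an anonymous
  // mapping is enough to show the construction pattern.
  bi::mapped_region region =
      bi::anonymous_shared_memory(sizeof(HandshakePayload));
  auto* payload = static_cast<HandshakePayload*>(region.get_address());

  // Placement new, as in SaveToSharedMemory() and PbLogShm::Create().
  new (&payload->mu) bi::interprocess_mutex;
  new (&payload->cv) bi::interprocess_condition;
  payload->waiting_on_stub = false;

  {
    bi::scoped_lock<bi::interprocess_mutex> lock(payload->mu);
    // One side blocks here until the peer signals:
    // payload->cv.wait(lock);
  }
  // ... and the peer, after updating the payload, wakes it up:
  payload->cv.notify_all();
  return 0;
}
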
+ +#include "pb_map.h" + +namespace triton { namespace backend { namespace python { + +std::unique_ptr +PbMap::Create( + std::unique_ptr& shm_pool, + std::unordered_map& map) +{ + std::vector> strings; + AllocatedSharedMemory dict_shm = shm_pool->Construct(); + dict_shm.data_->length = map.size(); + + AllocatedSharedMemory pair_shms = + shm_pool->Construct(map.size()); + dict_shm.data_->values = pair_shms.handle_; + + size_t i = 0; + for (auto& pair : map) { + auto key = PbString::Create(shm_pool, pair.first); + auto value = PbString::Create(shm_pool, pair.second); + + (pair_shms.data_.get())[i].key = key->ShmHandle(); + (pair_shms.data_.get())[i].value = value->ShmHandle(); + + strings.emplace_back(std::move(key)); + strings.emplace_back(std::move(value)); + i++; + } + + return std::unique_ptr(new PbMap(strings, dict_shm, pair_shms, map)); +} + +const std::unordered_map& +PbMap::UnorderedMap() +{ + return map_; +} + +bi::managed_external_buffer::handle_t +PbMap::ShmHandle() +{ + return dict_handle_; +} + +std::unique_ptr +PbMap::LoadFromSharedMemory( + std::unique_ptr& shm_pool, + bi::managed_external_buffer::handle_t handle) +{ + AllocatedSharedMemory dict_shm = shm_pool->Load(handle); + AllocatedSharedMemory pair_shms = + shm_pool->Load(dict_shm.data_->values); + + std::vector> pb_strings; + std::unordered_map map; + for (size_t i = 0; i < dict_shm.data_->length; i++) { + std::unique_ptr key = PbString::LoadFromSharedMemory( + shm_pool, (pair_shms.data_.get())[i].key); + + std::unique_ptr value = PbString::LoadFromSharedMemory( + shm_pool, (pair_shms.data_.get())[i].value); + + map.insert({key->String(), value->String()}); + pb_strings.emplace_back(std::move(key)); + pb_strings.emplace_back(std::move(value)); + } + + return std::unique_ptr( + new PbMap(pb_strings, dict_shm, pair_shms, map)); +} + +PbMap::PbMap( + std::vector>& strings, + AllocatedSharedMemory& dict_shm, + AllocatedSharedMemory& pair_shms, + std::unordered_map& map) + : strings_(std::move(strings)), dict_shm_(std::move(dict_shm)), + pair_shms_(std::move(pair_shms)), map_(std::move(map)) +{ + dict_handle_ = dict_shm.handle_; +} + +}}} // namespace triton::backend::python diff --git a/src/pb_map.h b/src/pb_map.h new file mode 100644 index 00000000..a231b719 --- /dev/null +++ b/src/pb_map.h @@ -0,0 +1,72 @@ +// Copyright 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions +// are met: +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// * Neither the name of NVIDIA CORPORATION nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +#pragma once + +#include + +#include "pb_string.h" +#include "shm_manager.h" + +namespace triton { namespace backend { namespace python { + +struct PairShm { + bi::managed_external_buffer::handle_t key; + bi::managed_external_buffer::handle_t value; +}; + +struct DictShm { + uint32_t length; + // `values` point to the location where there are `length` of Pair objects. + bi::managed_external_buffer::handle_t values; +}; + + +class PbMap { + public: + static std::unique_ptr Create( + std::unique_ptr& shm_pool, + std::unordered_map& map); + static std::unique_ptr LoadFromSharedMemory( + std::unique_ptr& shm_pool, + bi::managed_external_buffer::handle_t handle); + const std::unordered_map& UnorderedMap(); + bi::managed_external_buffer::handle_t ShmHandle(); + + private: + PbMap( + std::vector>& strings, + AllocatedSharedMemory& dict_shm, + AllocatedSharedMemory& pair_shms, + std::unordered_map& map); + + std::vector> strings_; + AllocatedSharedMemory dict_shm_; + AllocatedSharedMemory pair_shms_; + bi::managed_external_buffer::handle_t dict_handle_; + std::unordered_map map_; +}; +}}} // namespace triton::backend::python diff --git a/src/pb_memory.cc b/src/pb_memory.cc new file mode 100644 index 00000000..5b678f1a --- /dev/null +++ b/src/pb_memory.cc @@ -0,0 +1,509 @@ +// Copyright 2022-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions +// are met: +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// * Neither the name of NVIDIA CORPORATION nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
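
// Usage sketch (not part of this diff): PbMap stores a DictShm header plus an
// array of PairShm entries whose handles point at the key and value PbStrings,
// so an unordered_map<string, string> can be rebuilt on the receiving side
// from a single handle. The function name, pool type and placeholder file
// contents below are assumptions.

std::unordered_map<std::string, std::string> RoundTripMap(
    std::unique_ptr<SharedMemoryManager>& shm_pool)
{
  std::unordered_map<std::string, std::string> files{
      {"model.py", "def initialize(args): pass"},
      {"config.pbtxt", "backend: \"python\""}};

  // Producer: every key and value becomes a PbString; PairShm records their
  // handles and DictShm records the pair count.
  std::unique_ptr<PbMap> files_shm = PbMap::Create(shm_pool, files);
  bi::managed_external_buffer::handle_t handle = files_shm->ShmHandle();

  // Consumer: the whole map is reconstructed from the handle.
  std::unique_ptr<PbMap> loaded = PbMap::LoadFromSharedMemory(shm_pool, handle);
  return loaded->UnorderedMap();
}
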
+ +#include "pb_memory.h" + +#include + +namespace triton { namespace backend { namespace python { + +std::unique_ptr +PbMemory::Create( + std::unique_ptr& shm_pool, + TRITONSERVER_MemoryType memory_type, int64_t memory_type_id, + uint64_t byte_size, char* data, bool copy_gpu) +{ + size_t requested_byte_size = sizeof(MemoryShm); + if (memory_type == TRITONSERVER_MEMORY_GPU) { +#ifdef TRITON_ENABLE_GPU + requested_byte_size += sizeof(cudaIpcMemHandle_t); +#endif + } else { + requested_byte_size += byte_size; + } + + AllocatedSharedMemory memory_shm = + shm_pool->Construct(requested_byte_size); + + PbMemory::FillShmData( + shm_pool->GetCUDAMemoryPoolManager(), memory_type, memory_type_id, + byte_size, data, memory_shm.data_.get(), memory_shm.handle_, copy_gpu); + + if (memory_type == TRITONSERVER_MEMORY_CPU) { + data = memory_shm.data_.get() + sizeof(MemoryShm); + } + + std::unique_ptr pb_memory( + new PbMemory(memory_shm, data, false /* opened_cuda_ipc_handle */)); + +#ifdef TRITON_ENABLE_GPU + if (memory_type == TRITONSERVER_MEMORY_GPU) { + pb_memory->memory_shm_ptr_->gpu_pointer_offset = + pb_memory->GetGPUPointerOffset(); + } +#endif + return pb_memory; +} + +#ifndef TRITON_PB_STUB +std::unique_ptr +PbMemory::Create( + std::unique_ptr& shm_pool, + std::unique_ptr&& backend_memory, bool copy_gpu) +{ + std::unique_ptr pb_memory = PbMemory::Create( + shm_pool, backend_memory->MemoryType(), backend_memory->MemoryTypeId(), + backend_memory->ByteSize(), backend_memory->MemoryPtr(), copy_gpu); + pb_memory->backend_memory_ = std::move(backend_memory); + + return pb_memory; +} +#endif + +std::unique_ptr +PbMemory::Create( + std::unique_ptr& shm_pool, + TRITONSERVER_MemoryType memory_type, int64_t memory_type_id, + uint64_t byte_size, char* data, char* data_shm, + bi::managed_external_buffer::handle_t handle, bool copy_gpu) +{ + PbMemory::FillShmData( + shm_pool->GetCUDAMemoryPoolManager(), memory_type, memory_type_id, + byte_size, data, data_shm, handle, copy_gpu); + + if (memory_type == TRITONSERVER_MEMORY_CPU) { + data = data_shm + sizeof(MemoryShm); + } + + std::unique_ptr pb_memory( + new PbMemory(data_shm, data, handle, false /* opened_cuda_ipc_handle */)); + +#ifdef TRITON_ENABLE_GPU + if (memory_type == TRITONSERVER_MEMORY_GPU) { + pb_memory->memory_shm_ptr_->gpu_pointer_offset = + pb_memory->GetGPUPointerOffset(); + } +#endif + + return pb_memory; +} + +void +PbMemory::CopyBuffer( + std::unique_ptr& dst, std::unique_ptr& src) +{ + if (src->ByteSize() != dst->ByteSize()) { + throw PythonBackendException( + "Failed to copy memory buffers. 
Source and destination byte size do " + "not match: " + + std::to_string(dst->ByteSize()) + + " != " + std::to_string(src->ByteSize())); + } + + if (src->MemoryType() == TRITONSERVER_MEMORY_CPU && + dst->MemoryType() == TRITONSERVER_MEMORY_CPU) { + std::memcpy(dst->DataPtr(), src->DataPtr(), dst->ByteSize()); + return; + } + +#ifdef TRITON_ENABLE_GPU + cudaMemcpyKind kind = cudaMemcpyHostToDevice; + + if (src->MemoryType() == TRITONSERVER_MEMORY_CPU && + dst->MemoryType() == TRITONSERVER_MEMORY_GPU) { + kind = cudaMemcpyHostToDevice; + } else if ( + src->MemoryType() == TRITONSERVER_MEMORY_GPU && + dst->MemoryType() == TRITONSERVER_MEMORY_CPU) { + kind = cudaMemcpyDeviceToHost; + } else if ( + src->MemoryType() == TRITONSERVER_MEMORY_GPU && + dst->MemoryType() == TRITONSERVER_MEMORY_GPU) { + kind = cudaMemcpyDeviceToDevice; + } + + cudaError_t err; + if ((kind == cudaMemcpyDeviceToDevice) && + (src->MemoryTypeId() != dst->MemoryTypeId())) { + err = cudaMemcpyPeer( + dst->DataPtr(), dst->MemoryTypeId(), src->DataPtr(), + src->MemoryTypeId(), src->ByteSize()); + + } else { + err = cudaMemcpy(dst->DataPtr(), src->DataPtr(), src->ByteSize(), kind); + } + + if (err != cudaSuccess) { + throw PythonBackendException( + std::string( + "failed to copy data: " + std::string(cudaGetErrorString(err))) + .c_str()); + } + + if (kind == cudaMemcpyDeviceToDevice) { + // Synchronize the default stream for d2d copies. + // https://docs.nvidia.com/cuda/cuda-runtime-api/api-sync-behavior.html#api-sync-behavior__memcpy-sync + err = cudaStreamSynchronize(0); + if (err != cudaSuccess) { + throw PythonBackendException( + std::string( + "failed to synchronize the default CUDA stream. error: " + + std::string(cudaGetErrorString(err))) + .c_str()); + } + } +#endif +} + +void +PbMemory::FillShmData( + std::unique_ptr& cuda_pool, + TRITONSERVER_MemoryType memory_type, int64_t memory_type_id, + uint64_t byte_size, char* data, char* data_shm, + bi::managed_external_buffer::handle_t handle, bool copy_gpu) +{ + char* memory_data_shm = data_shm + sizeof(MemoryShm); + MemoryShm* memory_shm_ptr = reinterpret_cast(data_shm); + memory_shm_ptr->memory_release_id = 0; + bool use_cuda_shared_pool = false; + + if (memory_type == TRITONSERVER_MEMORY_GPU) { +#ifdef TRITON_ENABLE_GPU + if (data != nullptr) { + if (copy_gpu) { + ScopedSetDevice scoped_set_device(memory_type_id); + THROW_IF_CUDA_ERROR(cudaIpcGetMemHandle( + reinterpret_cast(memory_data_shm), data)); + } + if (cuda_pool->UseCudaSharedPool(memory_type_id) && + IsUsingCUDAPool(cuda_pool, memory_type_id, data)) { + use_cuda_shared_pool = true; + memory_shm_ptr->cuda_pool_offset = + data - + reinterpret_cast(cuda_pool->CUDAPoolAddress(memory_type_id)); + } + } +#endif // TRITON_ENABLE_GPU + } else { + if (data != nullptr) { + std::copy(data, data + byte_size, memory_data_shm); + } + } + + memory_shm_ptr->byte_size = byte_size; + memory_shm_ptr->memory_type_id = memory_type_id; + memory_shm_ptr->memory_type = memory_type; + memory_shm_ptr->use_cuda_shared_pool = use_cuda_shared_pool; +} + +std::unique_ptr +PbMemory::LoadFromSharedMemory( + std::unique_ptr& shm_pool, + bi::managed_external_buffer::handle_t handle, char* data_shm, + bool open_cuda_handle) +{ + MemoryShm* memory_shm_ptr = reinterpret_cast(data_shm); + char* memory_data_shm = data_shm + sizeof(MemoryShm); + char* data_ptr = nullptr; + bool opened_cuda_ipc_handle = false; + if (memory_shm_ptr->memory_type == TRITONSERVER_MEMORY_GPU && + open_cuda_handle) { +#ifdef TRITON_ENABLE_GPU + if 
(memory_shm_ptr->use_cuda_shared_pool) { + // When CUDA shared memory pool is used, the stub will retrieve the + // data pointer using the offset. + data_ptr = + (reinterpret_cast( + shm_pool->GetCUDAMemoryPoolManager()->CUDAPoolAddress( + memory_shm_ptr->memory_type_id)) + + memory_shm_ptr->cuda_pool_offset); + } else { + cudaIpcMemHandle_t* cuda_handle = + reinterpret_cast(memory_data_shm); + + // The pointer opened by the cudaIpcOpenMemHandle will refer to the base + // address. We need to manually correct the offset. + void* data_ptr_base; + CUDAHandler& cuda_handler = CUDAHandler::getInstance(); + cuda_handler.OpenCudaHandle( + memory_shm_ptr->memory_type_id, cuda_handle, &data_ptr_base); + + data_ptr = + (reinterpret_cast(data_ptr_base) + + memory_shm_ptr->gpu_pointer_offset); + opened_cuda_ipc_handle = true; + } + +#endif // TRITON_ENABLE_GPU + } else { + data_ptr = memory_data_shm; + } + + // This check only validates CPU shared memory access. + if (memory_shm_ptr->memory_type != TRITONSERVER_MEMORY_GPU && + (data_ptr + memory_shm_ptr->byte_size > + (char*)shm_pool->GetBaseAddress() + shm_pool->GetCurrentCapacity())) { + std::ostringstream oss; + oss << "0x" << std::hex + << (reinterpret_cast(data_ptr) + memory_shm_ptr->byte_size); + throw PythonBackendException( + std::string("Attempted to access out of bounds memory address ") + + oss.str()); + } + + return std::unique_ptr(new PbMemory( + data_shm, data_ptr, handle, + opened_cuda_ipc_handle /* opened_cuda_ipc_handle */)); +} + +std::unique_ptr +PbMemory::LoadFromSharedMemory( + std::unique_ptr& shm_pool, + bi::managed_external_buffer::handle_t handle, bool open_cuda_handle) +{ + AllocatedSharedMemory memory_shm = shm_pool->Load(handle); + MemoryShm* memory_shm_ptr = + reinterpret_cast(memory_shm.data_.get()); + char* memory_data_shm = memory_shm.data_.get() + sizeof(MemoryShm); + + char* data_ptr = nullptr; + bool opened_cuda_ipc_handle = false; + if (memory_shm_ptr->memory_type == TRITONSERVER_MEMORY_GPU) { + if (memory_shm_ptr->byte_size > 0 && open_cuda_handle) { +#ifdef TRITON_ENABLE_GPU + if (memory_shm_ptr->use_cuda_shared_pool) { + // When CUDA shared memory pool is used, the stub will retrieve the + // data pointer using the offset. + data_ptr = + (reinterpret_cast( + shm_pool->GetCUDAMemoryPoolManager()->CUDAPoolAddress( + memory_shm_ptr->memory_type_id)) + + memory_shm_ptr->cuda_pool_offset); + } else { + cudaIpcMemHandle_t* cuda_handle = + reinterpret_cast(memory_data_shm); + + // The pointer opened by the cudaIpcOpenMemHandle will refer to the base + // address. We need to manually correct the offset. + void* data_ptr_base; + CUDAHandler& cuda_handler = CUDAHandler::getInstance(); + cuda_handler.OpenCudaHandle( + memory_shm_ptr->memory_type_id, cuda_handle, &data_ptr_base); + + data_ptr = + (reinterpret_cast(data_ptr_base) + + memory_shm_ptr->gpu_pointer_offset); + opened_cuda_ipc_handle = true; + } +#endif + } + } else { + data_ptr = memory_data_shm; + } + + // This check only validates CPU shared memory access. 
+ if (memory_shm_ptr->memory_type != TRITONSERVER_MEMORY_GPU && + (data_ptr + memory_shm_ptr->byte_size > + (char*)shm_pool->GetBaseAddress() + shm_pool->GetCurrentCapacity())) { + std::ostringstream oss; + oss << "0x" << std::hex + << (reinterpret_cast(data_ptr) + memory_shm_ptr->byte_size); + throw PythonBackendException( + std::string("Attempted to access out of bounds memory address ") + + oss.str()); + } + + return std::unique_ptr(new PbMemory( + memory_shm, data_ptr, + opened_cuda_ipc_handle /* opened_cuda_ipc_handle */)); +} + +PbMemory::PbMemory( + AllocatedSharedMemory& memory_shm, char* data, + bool opened_cuda_ipc_handle) + : memory_shm_(std::move(memory_shm)), data_ptr_(data), + opened_cuda_ipc_handle_(opened_cuda_ipc_handle) +{ + memory_shm_ptr_ = reinterpret_cast(memory_shm_.data_.get()); + memory_shm_handle_ = memory_shm_.handle_; +} + +PbMemory::PbMemory( + char* memory_shm, char* data, bi::managed_external_buffer::handle_t handle, + bool opened_cuda_ipc_handle) +{ + memory_shm_ptr_ = reinterpret_cast(memory_shm); + data_ptr_ = data; + opened_cuda_ipc_handle_ = opened_cuda_ipc_handle; + memory_shm_handle_ = handle; +} + +bi::managed_external_buffer::handle_t +PbMemory::ShmHandle() +{ + return memory_shm_handle_; +} + +#ifdef TRITON_ENABLE_GPU +void* +PbMemory::GetGPUStartAddress() +{ + if (memory_shm_ptr_->memory_type == TRITONSERVER_MEMORY_GPU) { + CUDAHandler& cuda_api = CUDAHandler::getInstance(); + CUdeviceptr start_address = 0; + + // Skip this step for empty tensor as the CUDA API 'cuPointerGetAttribute' + // we use in this function does not accept nullptr. + if (data_ptr_) { + cuda_api.PointerGetAttribute( + &start_address, CU_POINTER_ATTRIBUTE_RANGE_START_ADDR, + reinterpret_cast(data_ptr_)); + } + + return reinterpret_cast(start_address); + } + + throw PythonBackendException( + "Calling GetGPUStartAddress function on CPU memory."); +} + +uint64_t +PbMemory::GetGPUPointerOffset() +{ + uint64_t offset; + if (memory_shm_ptr_->memory_type == TRITONSERVER_MEMORY_GPU) { + offset = data_ptr_ - reinterpret_cast(GetGPUStartAddress()); + } else { + throw PythonBackendException( + "Calling GetGPUPointerOffset function on CPU tensor."); + } + return offset; +} +#endif + +TRITONSERVER_MemoryType +PbMemory::MemoryType() const +{ + return memory_shm_ptr_->memory_type; +} + +void +PbMemory::SetMemoryReleaseId(uint64_t memory_release_id) +{ + memory_shm_ptr_->memory_release_id = memory_release_id; +} + +int64_t +PbMemory::MemoryTypeId() const +{ + return memory_shm_ptr_->memory_type_id; +} + +uint64_t +PbMemory::ByteSize() const +{ + return memory_shm_ptr_->byte_size; +} + +char* +PbMemory::ShmData() const +{ + return reinterpret_cast(memory_shm_ptr_) + sizeof(MemoryShm); +} + +char* +PbMemory::DataPtr() const +{ + return data_ptr_; +} + +uint64_t +PbMemory::ShmStructSize(TRITONSERVER_MemoryType memory_type, uint64_t byte_size) +{ + uint64_t total_memory_size = sizeof(MemoryShm); + if (memory_type == TRITONSERVER_MEMORY_GPU) { +#ifdef TRITON_ENABLE_GPU + total_memory_size += sizeof(cudaIpcMemHandle_t); +#endif + } else { + total_memory_size += byte_size; + } + + return total_memory_size; +} + +#ifdef TRITON_ENABLE_GPU +void +PbMemory::SetCudaIpcHandle(cudaIpcMemHandle_t* cuda_ipc_handle) +{ + *(reinterpret_cast(ShmData())) = *(cuda_ipc_handle); +} + +void +PbMemory::UpdateCUDAOffset(std::unique_ptr& cuda_pool) +{ + if (cuda_pool->UseCudaSharedPool(MemoryTypeId()) && + IsUsingCUDAPool(cuda_pool, MemoryTypeId(), DataPtr())) { + memory_shm_ptr_->cuda_pool_offset = + DataPtr() - + 
reinterpret_cast(cuda_pool->CUDAPoolAddress(MemoryTypeId())); + memory_shm_ptr_->use_cuda_shared_pool = true; + } +} +#endif + +PbMemory::~PbMemory() +{ + if (opened_cuda_ipc_handle_) { +#ifdef TRITON_ENABLE_GPU + CUDAHandler& cuda_handler = CUDAHandler::getInstance(); + cuda_handler.CloseCudaHandle( + memory_shm_ptr_->memory_type_id, GetGPUStartAddress()); +#endif + } + + if (release_callback_) { + release_callback_(); + } +} + +void +PbMemory::SetMemoryReleaseCallback(std::function release_callback) +{ + if (!release_callback_) { + release_callback_ = release_callback; + } else { + throw PythonBackendException("Release callback is already set."); + } +} + +uint64_t +PbMemory::MemoryReleaseId() +{ + return memory_shm_ptr_->memory_release_id; +} + +}}} // namespace triton::backend::python diff --git a/src/pb_memory.h b/src/pb_memory.h new file mode 100644 index 00000000..ad79daed --- /dev/null +++ b/src/pb_memory.h @@ -0,0 +1,193 @@ +// Copyright 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions +// are met: +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// * Neither the name of NVIDIA CORPORATION nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +#pragma once + +#include "pb_utils.h" +#include "shm_manager.h" +#include "triton/backend/backend_common.h" +#include "triton/backend/backend_memory.h" + +#ifdef TRITON_ENABLE_GPU +#include +#endif // TRITON_ENABLE_GPU + +namespace triton { namespace backend { namespace python { + +// +// Represents a memory object in shared memory. +// +struct MemoryShm { + // If the memory type is a GPU pointer, the offset of the GPU pointer from the + // base address. For CPU memory type this field contains garbage data. This + // field will only be used when the memory is not allocated from the CUDA + // shared memory pool. + uint64_t gpu_pointer_offset; + bool use_cuda_shared_pool; + // The offset of the memory from the base address of the CUDA shared memory + // pool. 
+ uint64_t cuda_pool_offset; + + TRITONSERVER_MemoryType memory_type; + int64_t memory_type_id; + uint64_t byte_size; + uint64_t memory_release_id; +}; + +class PbMemory { + public: + static std::unique_ptr Create( + std::unique_ptr& shm_pool, + TRITONSERVER_MemoryType memory_type, int64_t memory_type_id, + uint64_t byte_size, char* data, bool copy_gpu = true); + + static std::unique_ptr Create( + std::unique_ptr& shm_pool, + TRITONSERVER_MemoryType memory_type, int64_t memory_type_id, + uint64_t byte_size, char* data, char* data_shm, + bi::managed_external_buffer::handle_t handle, bool copy_gpu = true); + +#ifndef TRITON_PB_STUB + static std::unique_ptr Create( + std::unique_ptr& shm_pool, + std::unique_ptr&& backend_memory, bool copy_gpu = true); +#endif + +#ifdef TRITON_ENABLE_GPU + void SetCudaIpcHandle(cudaIpcMemHandle_t* cuda_ipc_handle); + + void UpdateCUDAOffset(std::unique_ptr& cuda_pool); +#endif + + // Copy the destination buffer to the source buffer. + static void CopyBuffer( + std::unique_ptr& dst, std::unique_ptr& src); + + static std::unique_ptr LoadFromSharedMemory( + std::unique_ptr& shm_pool, + bi::managed_external_buffer::handle_t memory_handle, + bool open_cuda_handle); + static std::unique_ptr LoadFromSharedMemory( + std::unique_ptr& shm_pool, + bi::managed_external_buffer::handle_t handle, char* data_shm, + bool open_cuda_handle); + static uint64_t ShmStructSize( + TRITONSERVER_MemoryType memory_type, uint64_t byte_size); + + bi::managed_external_buffer::handle_t ShmHandle(); + + /// Get the total byte size of the tensor. + uint64_t ByteSize() const; + + /// Get the triton memory type. + /// \return the memory type of the tensor. + TRITONSERVER_MemoryType MemoryType() const; + + /// Get the pointer. + /// \return The location to the memory where the data is stored. + char* DataPtr() const; + + /// Get the memory type id. + /// \return The memory type id of the tensor. + int64_t MemoryTypeId() const; + + /// Get the shm data + /// \return The memory type id of the tensor. + char* ShmData() const; + + /// Set the memory release id + void SetMemoryReleaseId(uint64_t memory_release_id); + + /// Memory Release ID + uint64_t MemoryReleaseId(); + + void SetMemoryReleaseCallback(std::function release_callback); + + bool UseCUDASharedPool() const + { + return memory_shm_ptr_->use_cuda_shared_pool; + } + + ~PbMemory(); + +#ifndef TRITON_PB_STUB + void SetBackendMemory(std::unique_ptr&& backend_memory) + { + backend_memory_ = std::move(backend_memory); + }; + + std::unique_ptr GetBackendMemory() + { + return std::move(backend_memory_); + }; +#endif + + private: + AllocatedSharedMemory memory_shm_; + MemoryShm* memory_shm_ptr_; + +#ifndef TRITON_PB_STUB + std::unique_ptr backend_memory_; +#endif + + std::function release_callback_; + + // Refers to the pointer that can hold the data. For CPU pointers this will be + // the same as memory_data_shm_ptr_. + char* data_ptr_; + + bi::managed_external_buffer::handle_t memory_shm_handle_; + bool opened_cuda_ipc_handle_; + +#ifdef TRITON_ENABLE_GPU + /// Calculate the pointer offset from the base address. + /// \return The offset of a device pointer. + /// \throws PythonBackendException if the tensor is stored in CPU. + uint64_t GetGPUPointerOffset(); + + /// Get the GPU start address. + /// \return The start address of a device pointer. + /// \throws PythonBackendException if the tensor is stored in CPU. 
+ void* GetGPUStartAddress(); + +#endif + + static void FillShmData( + std::unique_ptr& cuda_pool, + TRITONSERVER_MemoryType memory_type, int64_t memory_type_id, + uint64_t byte_size, char* data, char* data_shm, + bi::managed_external_buffer::handle_t handle, bool copy_gpu = true); + + PbMemory( + AllocatedSharedMemory& memory_shm, char* data, + bool opened_cuda_ipc_handle); + + PbMemory( + char* memory_shm, char* data, + bi::managed_external_buffer::handle_t handle, + bool opened_cuda_ipc_handle); +}; +}}} // namespace triton::backend::python diff --git a/src/pb_metric_reporter.cc b/src/pb_metric_reporter.cc new file mode 100644 index 00000000..19362905 --- /dev/null +++ b/src/pb_metric_reporter.cc @@ -0,0 +1,117 @@ +// Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions +// are met: +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// * Neither the name of NVIDIA CORPORATION nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +#include "pb_metric_reporter.h" + +#include "triton/backend/backend_common.h" + +namespace triton { namespace backend { namespace python { + +PbMetricReporter::PbMetricReporter( + TRITONBACKEND_ModelInstance* instance, TRITONBACKEND_Request** requests, + const uint32_t request_count, + std::shared_ptr> responses) + : instance_(instance), requests_(requests), request_count_(request_count), + responses_(responses), total_batch_size_(0), exec_start_ns_(0), + compute_start_ns_(0), compute_end_ns_(0), exec_end_ns_(0), + success_status_(true) +{ +} + +PbMetricReporter::~PbMetricReporter() +{ + for (uint32_t r = 0; r < request_count_; ++r) { + TRITONBACKEND_Request* request = requests_[r]; + + // Report statistics for the request. Note that there could + // still be responses that have not yet been sent but those + // cannot be captured in the statistics as they reflect only the + // request object. We use the execution start/end time for + // compute also so that the entire execution time is associated + // with the inference computation. 
+ if (responses_) { + LOG_IF_ERROR( + TRITONBACKEND_ModelInstanceReportStatistics( + instance_, request, ((*responses_)[r] != nullptr) /* success */, + exec_start_ns_, compute_start_ns_, compute_end_ns_, exec_end_ns_), + "failed reporting request statistics"); + } else { + LOG_IF_ERROR( + TRITONBACKEND_ModelInstanceReportStatistics( + instance_, request, success_status_, exec_start_ns_, + compute_start_ns_, compute_end_ns_, exec_end_ns_), + "failed reporting request statistics"); + } + } + + // Report the entire batch statistics. This backend does not support + // batching so the total batch size is always 1. + if (total_batch_size_ != 0) { + LOG_IF_ERROR( + TRITONBACKEND_ModelInstanceReportBatchStatistics( + instance_, total_batch_size_, exec_start_ns_, compute_start_ns_, + compute_end_ns_, exec_end_ns_), + "failed reporting batch request statistics"); + } +} + +void +PbMetricReporter::SetBatchStatistics(size_t total_batch_size) +{ + total_batch_size_ = total_batch_size; +} + +void +PbMetricReporter::SetExecStartNs(const uint64_t exec_start_ns) +{ + exec_start_ns_ = exec_start_ns; +} + +void +PbMetricReporter::SetComputeStartNs(const uint64_t compute_start_ns) +{ + compute_start_ns_ = compute_start_ns; +} + +void +PbMetricReporter::SetComputeEndNs(const uint64_t compute_end_ns) +{ + compute_end_ns_ = compute_end_ns; +} + +void +PbMetricReporter::SetExecEndNs(const uint64_t exec_end_ns) +{ + exec_end_ns_ = exec_end_ns; +} + +void +PbMetricReporter::SetSuccessStatus(const bool success_status) +{ + success_status_ = success_status; +} + +}}} // namespace triton::backend::python diff --git a/src/pb_metric_reporter.h b/src/pb_metric_reporter.h new file mode 100644 index 00000000..89c81b38 --- /dev/null +++ b/src/pb_metric_reporter.h @@ -0,0 +1,61 @@ +// Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions +// are met: +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// * Neither the name of NVIDIA CORPORATION nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +#pragma once + +#include +#include +#include + +#include "triton/core/tritonbackend.h" + +namespace triton { namespace backend { namespace python { +class PbMetricReporter { + TRITONBACKEND_ModelInstance* instance_; + TRITONBACKEND_Request** requests_; + uint32_t request_count_; + std::shared_ptr> responses_; + size_t total_batch_size_; + uint64_t exec_start_ns_; + uint64_t compute_start_ns_; + uint64_t compute_end_ns_; + uint64_t exec_end_ns_; + bool success_status_; + + public: + PbMetricReporter( + TRITONBACKEND_ModelInstance* instance, TRITONBACKEND_Request** requests, + const uint32_t request_count, + std::shared_ptr> responses); + ~PbMetricReporter(); + void SetBatchStatistics(size_t total_batch_size); + void SetExecStartNs(const uint64_t exec_start_ns); + void SetComputeStartNs(const uint64_t compute_start_ns); + void SetComputeEndNs(const uint64_t compute_end_ns); + void SetExecEndNs(const uint64_t exec_end_ns); + void SetSuccessStatus(const bool success_status); +}; +}}}; // namespace triton::backend::python diff --git a/src/pb_preferred_memory.h b/src/pb_preferred_memory.h new file mode 100644 index 00000000..c28f1b87 --- /dev/null +++ b/src/pb_preferred_memory.h @@ -0,0 +1,57 @@ +// Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions +// are met: +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// * Neither the name of NVIDIA CORPORATION nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
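// A minimal usage sketch for PbMetricReporter. The caller and NowNs() are
// hypothetical, and the std::vector<TRITONBACKEND_Response*> element type is
// inferred from the reporter's constructor: timestamps are recorded around
// execution and the destructor reports per-request and batch statistics.
#include <chrono>

static uint64_t
NowNs()
{
  return std::chrono::duration_cast<std::chrono::nanoseconds>(
             std::chrono::steady_clock::now().time_since_epoch())
      .count();
}

void
ExampleExecute(
    TRITONBACKEND_ModelInstance* instance, TRITONBACKEND_Request** requests,
    const uint32_t request_count,
    std::shared_ptr<std::vector<TRITONBACKEND_Response*>> responses)
{
  PbMetricReporter reporter(instance, requests, request_count, responses);
  reporter.SetExecStartNs(NowNs());
  reporter.SetComputeStartNs(NowNs());
  // ... run the model here ...
  reporter.SetComputeEndNs(NowNs());
  reporter.SetExecEndNs(NowNs());
  reporter.SetBatchStatistics(request_count);
}  // Statistics are reported when 'reporter' goes out of scope.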
+ +#pragma once + +namespace triton { namespace backend { namespace python { + +class PreferredMemory { + public: + enum MemoryType { kGPU, kCPU, kDefault }; + + PreferredMemory() + : preferred_memory_type_(MemoryType::kDefault), preferred_device_id_(0) + { + } + + PreferredMemory( + const MemoryType& preferred_memory_type, + const int64_t& preferred_device_id) + : preferred_memory_type_(preferred_memory_type), + preferred_device_id_(preferred_device_id) + { + } + + MemoryType PreferredMemoryType() { return preferred_memory_type_; } + + int64_t PreferredDeviceId() { return preferred_device_id_; } + + private: + MemoryType preferred_memory_type_; + int64_t preferred_device_id_; +}; + +}}} // namespace triton::backend::python diff --git a/src/pb_response_iterator.cc b/src/pb_response_iterator.cc new file mode 100644 index 00000000..536d4232 --- /dev/null +++ b/src/pb_response_iterator.cc @@ -0,0 +1,171 @@ +// Copyright 2023-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions +// are met: +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// * Neither the name of NVIDIA CORPORATION nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +#include "pb_response_iterator.h" + +#include + +#include + +#include "pb_stub.h" +namespace py = pybind11; + +namespace triton { namespace backend { namespace python { + +ResponseIterator::ResponseIterator( + const std::shared_ptr& response) + : id_(response->Id()), is_finished_(false), is_cleared_(false), idx_(0) +{ + response_buffer_.push(response); + pb_bls_cancel_ = std::make_shared(response->Id()); +} + +ResponseIterator::~ResponseIterator() +{ + // Fetch all the remaining responses if not finished yet. 
+ if (!is_finished_) { + bool done = false; + while (!done) { + try { + Next(); + } + catch (const py::stop_iteration& exception) { + done = true; + } + } + } + + if (!is_cleared_) { + Clear(); + } + responses_.clear(); +} + +std::shared_ptr +ResponseIterator::Next() +{ + if (is_finished_) { + if (!is_cleared_) { + Clear(); + } + + if (idx_ < responses_.size()) { + return responses_[idx_++]; + } else { + throw py::stop_iteration("Iteration is done for the responses."); + } + } else { + std::shared_ptr response; + { + { + std::unique_lock lock{mu_}; + while (response_buffer_.empty()) { + py::gil_scoped_release release; + cv_.wait(lock); + } + response = response_buffer_.front(); + response_buffer_.pop(); + is_finished_ = response->IsLastResponse(); + responses_.push_back(response); + } + } + + if (is_finished_) { + idx_ = responses_.size(); + Clear(); + } + return response; + } +} + +void +ResponseIterator::Iter() +{ + if (is_finished_) { + // If the previous iteration is finished, reset the index so that it will + // iterator from the beginning of the responses. Otherwise just resume the + // iteration from the previous index. + if (idx_ >= responses_.size()) { + idx_ = 0; + } + } +} + +void +ResponseIterator::EnqueueResponse(std::shared_ptr infer_response) +{ + { + std::lock_guard lock{mu_}; + response_buffer_.push(infer_response); + } + cv_.notify_one(); +} + +void* +ResponseIterator::Id() +{ + return id_; +} + +void +ResponseIterator::Clear() +{ + std::unique_ptr& stub = Stub::GetOrCreateInstance(); + stub->EnqueueCleanupId(id_, PYTHONSTUB_BLSDecoupledInferPayloadCleanup); + { + std::lock_guard lock{mu_}; + response_buffer_.push(DUMMY_MESSAGE); + } + cv_.notify_all(); + std::queue> empty; + std::swap(response_buffer_, empty); + is_cleared_ = true; +} + +std::vector> +ResponseIterator::GetExistingResponses() +{ + std::vector> responses; + std::unique_lock lock{mu_}; + while (!response_buffer_.empty()) { + responses.push_back(response_buffer_.front()); + response_buffer_.pop(); + } + is_finished_ = true; + is_cleared_ = true; + + return responses; +} + +void +ResponseIterator::Cancel() +{ + if (!is_finished_) { + pb_bls_cancel_->Cancel(); + } +} + +}}} // namespace triton::backend::python diff --git a/src/pb_response_iterator.h b/src/pb_response_iterator.h new file mode 100644 index 00000000..cb26d6a3 --- /dev/null +++ b/src/pb_response_iterator.h @@ -0,0 +1,61 @@ +// Copyright 2023-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions +// are met: +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// * Neither the name of NVIDIA CORPORATION nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +#pragma once + +#include + +#include "infer_response.h" +#include "pb_bls_cancel.h" + +namespace triton { namespace backend { namespace python { + +class ResponseIterator { + public: + ResponseIterator(const std::shared_ptr& response); + ~ResponseIterator(); + + std::shared_ptr Next(); + void Iter(); + void EnqueueResponse(std::shared_ptr infer_response); + void* Id(); + void Clear(); + std::vector> GetExistingResponses(); + void Cancel(); + + private: + std::vector> responses_; + std::queue> response_buffer_; + std::mutex mu_; + std::condition_variable cv_; + void* id_; + bool is_finished_; + bool is_cleared_; + size_t idx_; + std::shared_ptr pb_bls_cancel_; +}; + +}}} // namespace triton::backend::python diff --git a/src/pb_string.cc b/src/pb_string.cc new file mode 100644 index 00000000..4f8a1227 --- /dev/null +++ b/src/pb_string.cc @@ -0,0 +1,126 @@ +// Copyright 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions +// are met: +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// * Neither the name of NVIDIA CORPORATION nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
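// Sketch of the consumption pattern ResponseIterator supports. The caller
// below is illustrative, not part of the backend: responses are produced on
// another thread via EnqueueResponse() and consumed with Next(), which
// throws py::stop_iteration once the last response has been returned.
void
DrainResponses(ResponseIterator& iterator)
{
  while (true) {
    try {
      std::shared_ptr<InferResponse> response = iterator.Next();
      // ... hand 'response' to the Python caller ...
    }
    catch (const py::stop_iteration&) {
      break;  // The last response has already been delivered.
    }
  }
}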
+ +#include "pb_string.h" + +namespace triton { namespace backend { namespace python { + +std::unique_ptr +PbString::Create( + std::unique_ptr& shm_pool, const std::string& string) +{ + AllocatedSharedMemory string_container_shm = + shm_pool->Construct(); + string_container_shm.data_->length = string.size(); + + AllocatedSharedMemory string_shm = + shm_pool->Construct(string.size()); + std::memcpy(string_shm.data_.get(), string.data(), string.size()); + string_container_shm.data_->data = string_shm.handle_; + + return std::unique_ptr( + new PbString(string_container_shm, string_shm)); +} + +std::unique_ptr +PbString::Create( + const std::string& string, char* data_shm, + bi::managed_external_buffer::handle_t handle) +{ + StringShm* string_container_shm = reinterpret_cast(data_shm); + string_container_shm->length = string.size(); + + char* string_shm = data_shm + sizeof(StringShm); + std::memcpy(string_shm, string.data(), string.size()); + + return std::unique_ptr( + new PbString(string_container_shm, string_shm, handle)); +} + +std::unique_ptr +PbString::LoadFromSharedMemory( + std::unique_ptr& shm_pool, + bi::managed_external_buffer::handle_t handle) +{ + AllocatedSharedMemory string_container_shm = + shm_pool->Load(handle); + AllocatedSharedMemory string_shm = + shm_pool->Load(string_container_shm.data_->data); + + return std::unique_ptr( + new PbString(string_container_shm, string_shm)); +} + +std::unique_ptr +PbString::LoadFromSharedMemory( + bi::managed_external_buffer::handle_t handle, char* data_shm) +{ + StringShm* string_container_shm = reinterpret_cast(data_shm); + char* string_shm = data_shm + sizeof(StringShm); + + return std::unique_ptr( + new PbString(string_container_shm, string_shm, handle)); +} + +PbString::PbString( + AllocatedSharedMemory& string_container_shm, + AllocatedSharedMemory& string_shm) + : string_container_shm_(std::move(string_container_shm)), + string_shm_(std::move(string_shm)) +{ + string_shm_ptr_ = string_shm_.data_.get(); + string_container_shm_ptr_ = string_container_shm_.data_.get(); + string_handle_ = string_container_shm_.handle_; +} + +PbString::PbString( + StringShm* string_container_shm, char* string_shm, + bi::managed_external_buffer::handle_t handle) +{ + string_shm_ptr_ = string_shm; + string_container_shm_ptr_ = string_container_shm; + string_handle_ = handle; +} + +bi::managed_external_buffer::handle_t +PbString::ShmHandle() +{ + return string_handle_; +} + +std::size_t +PbString::ShmStructSize(const std::string& string) +{ + return string.size() + sizeof(StringShm); +} + +std::size_t +PbString::Size() +{ + return string_container_shm_ptr_->length + sizeof(StringShm); +} + +}}} // namespace triton::backend::python diff --git a/src/pb_string.h b/src/pb_string.h new file mode 100644 index 00000000..5d1ecff0 --- /dev/null +++ b/src/pb_string.h @@ -0,0 +1,80 @@ +// Copyright 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions +// are met: +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. 
+// * Neither the name of NVIDIA CORPORATION nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +#pragma once + +#include "shm_manager.h" + +namespace triton { namespace backend { namespace python { + +struct StringShm { + bi::managed_external_buffer::handle_t data; + size_t length; +}; + +class PbString { + public: + static std::unique_ptr Create( + std::unique_ptr& shm_pool, + const std::string& string); + static std::unique_ptr Create( + const std::string& string, char* data_shm, + bi::managed_external_buffer::handle_t handle); + static std::unique_ptr LoadFromSharedMemory( + std::unique_ptr& shm_pool, + bi::managed_external_buffer::handle_t handle); + static std::unique_ptr LoadFromSharedMemory( + bi::managed_external_buffer::handle_t handle, char* data_shm); + static std::size_t ShmStructSize(const std::string& string); + + char* MutableString() { return string_shm_ptr_; } + std::string String() + { + return std::string( + string_shm_ptr_, string_shm_ptr_ + string_container_shm_ptr_->length); + } + bi::managed_external_buffer::handle_t ShmHandle(); + std::size_t Size(); + + private: + AllocatedSharedMemory string_container_shm_; + StringShm* string_container_shm_ptr_; + + AllocatedSharedMemory string_shm_; + char* string_shm_ptr_; + + bi::managed_external_buffer::handle_t string_handle_; + + PbString( + AllocatedSharedMemory& string_container_shm, + AllocatedSharedMemory& string_shm); + + PbString( + StringShm* string_container_shm, char* string_shm, + bi::managed_external_buffer::handle_t handle); +}; + +}}} // namespace triton::backend::python diff --git a/src/pb_stub.cc b/src/pb_stub.cc index 6eaa1c8a..56048d78 100644 --- a/src/pb_stub.cc +++ b/src/pb_stub.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// Copyright 2021-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions @@ -24,688 +24,1986 @@ // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
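// Layout sketch for the flat form of PbString (the helper below is
// illustrative, not part of the class): PbString::Create(string, data_shm,
// handle) writes a StringShm header immediately followed by the character
// data, which is why ShmStructSize(string) is sizeof(StringShm) + size().
#include <cstring>

char*
ExampleWriteFlatString(const std::string& s, char* data_shm)
{
  StringShm* header = reinterpret_cast<StringShm*>(data_shm);
  header->length = s.size();                     // Length precedes the bytes.
  char* payload = data_shm + sizeof(StringShm);  // Data follows the header.
  std::memcpy(payload, s.data(), s.size());
  return payload;
}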
-#include -#include -#include +#include "pb_stub.h" + #include #include -#include + #include #include #include #include #include +#include #include #include #include +#include #include #include + +#include "correlation_id.h" +#include "model_loader.h" +#include "pb_error.h" +#include "pb_map.h" +#include "pb_preferred_memory.h" +#include "pb_response_iterator.h" +#include "pb_string.h" +#include "pb_stub_log.h" #include "pb_utils.h" +#include "response_sender.h" +#include "scoped_defer.h" #include "shm_manager.h" +#include "triton/common/nvtx.h" + +#ifdef _WIN32 +#include // SIGINT & SIGTERM +#include +#else +#include +#endif + +#ifdef TRITON_ENABLE_GPU +#include +#endif // TRITON_ENABLE_GPU namespace py = pybind11; using namespace pybind11::literals; namespace bi = boost::interprocess; +#ifndef TRITON_ENABLE_GPU +using cudaStream_t = void*; +#endif namespace triton { namespace backend { namespace python { -#define LOG_IF_EXCEPTION(X) \ - do { \ - try { \ - (X); \ - } \ - catch (const PythonBackendException& pb_exception) { \ - LOG_INFO << pb_exception.what(); \ - } \ - } while (false) - -#define LOG_EXCEPTION(E) \ - do { \ - LOG_INFO << E.what(); \ - } while (false) - -// Macros that use current filename and line number. -#define LOG_INFO LOG_INFO_FL(__FILE__, __LINE__) - -class Logger { - public: - // Log a message. - void Log(const std::string& msg) { std::cerr << msg << std::endl; } - - // Flush the log. - void Flush() { std::cerr << std::flush; } -}; - -Logger gLogger_; -class LogMessage { - public: - LogMessage(const char* file, int line) - { - std::string path(file); - size_t pos = path.rfind('/'); - if (pos != std::string::npos) { - path = path.substr(pos + 1, std::string::npos); +std::atomic non_graceful_exit = {false}; + +void +SignalHandler(int signum) +{ + // Skip the SIGINT and SIGTERM +} + +template +PYTYPE +PyDefaultArgumentToMutableType(const py::object& argument) +{ + // The default argument on Python functions always reference the same copy, + // meaning if the default argument is changed by the function, then it is + // changed for all subsequent calls to the function. Thus, default arguments + // should be limited to basic types (i.e. None). This helper function returns + // an empty expected type, if the argument is None (i.e. default initialized). + // If the argument is neither None nor expected type, an exception is thrown. 
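// Usage sketch for the helper above (a hypothetical pybind11 binding; the
// function and argument names are illustrative): the bound default is
// py::none() rather than a shared mutable py::dict, and the helper converts
// it into a fresh dict for each call.
void
ExampleBindWithSafeDefault(py::module_& m)
{
  m.def(
      "example_api",
      [](const py::object& parameters_obj) {
        py::dict parameters =
            PyDefaultArgumentToMutableType<py::dict>(parameters_obj);
        parameters["visited"] = true;  // Mutates a per-call copy only.
      },
      py::arg("parameters") = py::none());
}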
+ if (py::isinstance(argument)) { + return PYTYPE(); + } + if (py::isinstance(argument)) { + return argument; + } + throw PythonBackendException( + std::string("Expect ") + typeid(PYTYPE).name() + ", got " + + std::string(py::str(argument.get_type()))); +} + +std::string +PyParametersToJSON(const py::dict& parameters) +{ + for (const auto& pair : parameters) { + if (!py::isinstance(pair.first)) { + throw PythonBackendException( + "Expect parameters keys to have type str, found type " + + std::string(py::str(pair.first.get_type()))); + } + if (!py::isinstance(pair.second) && + !py::isinstance(pair.second) && + !py::isinstance(pair.second)) { + throw PythonBackendException( + "Expect parameters values to have type bool/int/str, found type " + + std::string(py::str(pair.second.get_type()))); } + } + py::module_ py_json = py::module_::import("json"); + std::string parameters_str = py::str(py_json.attr("dumps")(parameters)); + return parameters_str; +} + +void +AsyncEventFutureDoneCallback(const py::object& py_future) +{ + std::unique_ptr& stub = Stub::GetOrCreateInstance(); + stub->BackgroundFutureDone(py_future); +} - struct timeval tv; - gettimeofday(&tv, NULL); - struct tm tm_time; - gmtime_r(((time_t*)&(tv.tv_sec)), &tm_time); - stream_ << std::setfill('0') << std::setw(2) << (tm_time.tm_mon + 1) - << std::setw(2) << tm_time.tm_mday << " " << std::setw(2) - << tm_time.tm_hour << ':' << std::setw(2) << tm_time.tm_min << ':' - << std::setw(2) << tm_time.tm_sec << "." << std::setw(6) - << tv.tv_usec << ' ' << static_cast(getpid()) << ' ' - << path << ':' << line << "] "; +void +Stub::Instantiate( + int64_t shm_growth_size, int64_t shm_default_size, + const std::string& shm_region_name, const std::string& model_path, + const std::string& model_version, const std::string& triton_install_path, + bi::managed_external_buffer::handle_t ipc_control_handle, + const std::string& name, const std::string& python_runtime_model) +{ + model_context_.Init( + model_path, python_runtime_model, triton_install_path, model_version); + name_ = name; + health_mutex_ = nullptr; + initialized_ = false; + finalizing_ = false; + stub_to_parent_thread_ = false; + parent_to_stub_thread_ = false; + + try { + shm_pool_ = std::make_unique( + shm_region_name, shm_default_size, shm_growth_size, false /* create */); + + AllocatedSharedMemory ipc_control = + shm_pool_->Load(ipc_control_handle); + ipc_control_ = ipc_control.data_.get(); + + health_mutex_ = &(ipc_control_->stub_health_mutex); + + stub_message_queue_ = MessageQueue:: + LoadFromSharedMemory(shm_pool_, ipc_control_->stub_message_queue); + + parent_message_queue_ = + MessageQueue:: + LoadFromSharedMemory(shm_pool_, ipc_control_->parent_message_queue); + + stub_to_parent_mq_ = MessageQueue:: + LoadFromSharedMemory(shm_pool_, ipc_control_->stub_to_parent_mq); + + parent_to_stub_mq_ = MessageQueue:: + LoadFromSharedMemory(shm_pool_, ipc_control_->parent_to_stub_mq); + + memory_manager_message_queue_ = + MessageQueue::LoadFromSharedMemory( + shm_pool_, ipc_control_->memory_manager_message_queue); + + // If the Python model is using an execution environment, we need to + // remove the first part of the LD_LIBRARY_PATH before the colon (i.e. + // :$OLD_LD_LIBRARY_PATH). The + // section was added before launching the stub process and it may + // interfere with the shared library resolution of other executable and + // binaries. 
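// Minimal sketch of the path surgery described above (illustrative helper):
// given "<execution_env_libs>:<original_LD_LIBRARY_PATH>", everything up to
// and including the first colon is dropped so the execution environment's
// libraries stop shadowing the ones other binaries should resolve.
#include <string>

std::string
ExampleDropFirstEntry(const std::string& ld_library_path)
{
  const size_t pos = ld_library_path.find(':');
  if (pos == std::string::npos) {
    // The real code treats a missing colon as an error; this sketch simply
    // returns the input unchanged.
    return ld_library_path;
  }
  return ld_library_path.substr(pos + 1);
}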
+ if (ipc_control_->uses_env) { +#ifndef _WIN32 + char* ld_library_path = std::getenv("LD_LIBRARY_PATH"); + + if (ld_library_path != nullptr) { + std::string ld_library_path_str = ld_library_path; + // If we use an Execute Environment, the path must contain a colon. + size_t find_pos = ld_library_path_str.find(':'); + if (find_pos == std::string::npos) { + throw PythonBackendException( + "LD_LIBRARY_PATH must contain a colon when passing an " + "execution environment."); + } + ld_library_path_str = ld_library_path_str.substr(find_pos + 1); + int status = setenv( + "LD_LIBRARY_PATH", const_cast(ld_library_path_str.c_str()), + 1 /* overwrite */); + if (status != 0) { + throw PythonBackendException( + "Failed to correct the LD_LIBRARY_PATH environment in the " + "Python backend stub."); + } + } else { + throw PythonBackendException( + "When using an execution environment, LD_LIBRARY_PATH variable " + "cannot be empty."); + } +#else + throw PythonBackendException( + "Custom execution environments are not currently supported on " + "Windows."); +#endif + } } + catch (const PythonBackendException& pb_exception) { + LOG_INFO << pb_exception.what() << std::endl; + exit(1); + } +} - ~LogMessage() { gLogger_.Log(stream_.str()); } +std::unique_ptr>& +Stub::MemoryManagerQueue() +{ + return memory_manager_message_queue_; +} - std::stringstream& stream() { return stream_; } +bool& +Stub::Health() +{ + return ipc_control_->stub_health; +} - private: - std::stringstream stream_; -}; +std::unique_ptr& +Stub::SharedMemory() +{ + return shm_pool_; +} -#define LOG_INFO_FL(FN, LN) LogMessage((char*)(FN), LN).stream() +std::unique_ptr +Stub::PopMessage() +{ + bool success = false; + std::unique_ptr ipc_message; + bi::managed_external_buffer::handle_t message; + while (!success) { + message = stub_message_queue_->Pop(1000, success); + } -void -SignalHandler(int signum) + ipc_message = IPCMessage::LoadFromSharedMemory(shm_pool_, message); + + return ipc_message; +} + +bool +Stub::IsDecoupled() { - // Skip the SIGINT + return ipc_control_->decoupled; } -bool sigterm_received = false; +bool +Stub::RunCommand() +{ + NVTX_RANGE(nvtx_, "RunCommand " + name_); + std::unique_ptr ipc_message; + { + // Release the GIL lock when waiting for new message. Without this line, the + // other threads in the user's Python model cannot make progress if they + // give up GIL. + py::gil_scoped_release release; + ipc_message = this->PopMessage(); + } + switch (ipc_message->Command()) { + case PYTHONSTUB_CommandType::PYTHONSTUB_AutoCompleteRequest: { + // Only run this case when auto complete was requested by + // Triton core. 
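// Context sketch for the command dispatch above (simplified, illustrative
// caller): the stub repeatedly pops one IPC message from the parent and
// dispatches on its command; RunCommand() returning true means the stub
// process should terminate.
void
ExampleStubLoop(std::unique_ptr<Stub>& stub)
{
  bool finalize = false;
  while (!finalize) {
    finalize = stub->RunCommand();  // Blocks until the parent sends a command.
  }
}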
+ bool has_exception = false; + std::string error_string; + std::string auto_complete_config; + + std::unique_ptr auto_complete_response_msg = + IPCMessage::Create(shm_pool_, false /* inline_response */); + auto_complete_response_msg->Command() = PYTHONSTUB_AutoCompleteResponse; + std::unique_ptr error_string_shm; + std::unique_ptr auto_complete_config_shm; + AllocatedSharedMemory auto_complete_response = + shm_pool_->Construct(); + + ScopedDefer receive_autocomplete_finalize( + [this] { stub_message_queue_->Pop(); }); + ScopedDefer _([this, &auto_complete_response_msg] { + SendIPCMessage(auto_complete_response_msg); + }); + + auto_complete_response.data_->response_has_error = false; + auto_complete_response.data_->response_is_error_set = false; + auto_complete_response.data_->response_has_model_config = false; + auto_complete_response_msg->Args() = auto_complete_response.handle_; + + try { + AutoCompleteModelConfig(ipc_message->Args(), &auto_complete_config); + } + catch (const PythonBackendException& pb_exception) { + has_exception = true; + error_string = pb_exception.what(); + } + catch (const py::error_already_set& error) { + has_exception = true; + error_string = error.what(); + } + + if (has_exception) { + // Do not delete the region. The region will be deleted by the parent + // process. + shm_pool_->SetDeleteRegion(false); + LOG_INFO << "Failed to initialize Python stub for auto-complete: " + << error_string; + auto_complete_response.data_->response_has_error = true; + auto_complete_response.data_->response_is_error_set = false; + + LOG_IF_EXCEPTION( + error_string_shm = PbString::Create(shm_pool_, error_string)); + if (error_string_shm != nullptr) { + auto_complete_response.data_->response_is_error_set = true; + auto_complete_response.data_->response_error = + error_string_shm->ShmHandle(); + } + + return true; // Terminate the stub process. + } else { + LOG_IF_EXCEPTION( + auto_complete_config_shm = + PbString::Create(shm_pool_, auto_complete_config)); + if (auto_complete_config_shm != nullptr) { + auto_complete_response.data_->response_has_model_config = true; + auto_complete_response.data_->response_model_config = + auto_complete_config_shm->ShmHandle(); + } + } + } break; + case PYTHONSTUB_CommandType::PYTHONSTUB_InitializeRequest: { + bool has_exception = false; + std::string error_string; + + std::unique_ptr initialize_response_msg = + IPCMessage::Create(shm_pool_, false /* inline_response */); + initialize_response_msg->Command() = PYTHONSTUB_InitializeResponse; + std::unique_ptr error_string_shm; + AllocatedSharedMemory initialize_response = + shm_pool_->Construct(); + + // The initialization is done in three steps. First the main process sends + // a message to the stub process asking to begin to initialize the Python + // model. After that is finished stub process sends a message to the + // parent process that the initialization is finished. Finally, the + // parent process sends a message to the stub process asking the stub + // process to release any objects it has held in shared memory. 
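// The two ScopedDefer objects declared below unwind in reverse order of
// declaration, so the initialize response is sent to the parent first, and
// only then does the stub block on stub_message_queue_->Pop() for the
// parent's acknowledgement. A self-contained sketch of that ordering
// (DeferSketch is illustrative, not the real ScopedDefer):
#include <functional>
#include <iostream>

struct DeferSketch {
  std::function<void()> fn;
  ~DeferSketch() { fn(); }
};

void
ExampleDeferOrdering()
{
  DeferSketch declared_first{[] { std::cout << "runs second\n"; }};
  DeferSketch declared_second{[] { std::cout << "runs first\n"; }};
}  // Destructors run in reverse declaration order.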
+ ScopedDefer receive_initialize_finalize( + [this] { stub_message_queue_->Pop(); }); + ScopedDefer _([this, &initialize_response_msg] { + SendIPCMessage(initialize_response_msg); + }); + + initialize_response.data_->response_has_error = false; + initialize_response.data_->response_is_error_set = false; + initialize_response_msg->Args() = initialize_response.handle_; + + try { + Initialize(ipc_message->Args()); + } + catch (const PythonBackendException& pb_exception) { + has_exception = true; + error_string = pb_exception.what(); + } + catch (const py::error_already_set& error) { + has_exception = true; + error_string = error.what(); + } + + if (has_exception) { + // Do not delete the region. The region will be deleted by the parent + // process. + shm_pool_->SetDeleteRegion(false); + LOG_INFO << "Failed to initialize Python stub: " << error_string; + initialize_response.data_->response_has_error = true; + initialize_response.data_->response_is_error_set = false; + + LOG_IF_EXCEPTION( + error_string_shm = PbString::Create(shm_pool_, error_string)); + if (error_string_shm != nullptr) { + initialize_response.data_->response_is_error_set = true; + initialize_response.data_->response_error = + error_string_shm->ShmHandle(); + } + + return true; // Terminate the stub process. + } + } break; + case PYTHONSTUB_CommandType::PYTHONSTUB_ExecuteRequest: { + AllocatedSharedMemory request_batch = + shm_pool_->Load(ipc_message->Args()); + RequestBatch* request_batch_shm_ptr = + reinterpret_cast(request_batch.data_.get()); + ProcessRequests(request_batch_shm_ptr); + + } break; + case PYTHONSTUB_CommandType::PYTHONSTUB_FinalizeRequest: + ipc_message->Command() = PYTHONSTUB_FinalizeResponse; + // Clean up response_iterator_map_ before sending sending message back to + // the parent process to make sure that the clean up message can be + // processed before the message queue is destroyed. 
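// The swap-with-a-temporary idiom used below destroys every queued
// ResponseIterator (and releases the shared memory it still references)
// immediately, instead of leaving that work to the map's destructor after
// the message queues are gone. A generic sketch of the idiom:
#include <unordered_map>

template <typename Key, typename Value>
void
ExampleSwapClear(std::unordered_map<Key, Value>& map)
{
  // The temporary adopts the old contents and destroys them at the end of
  // this full expression, leaving 'map' empty.
  std::unordered_map<Key, Value>().swap(map);
}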
+ { + std::lock_guard lock(response_iterator_map_mu_); + std::unordered_map>().swap( + response_iterator_map_); + } + SendIPCMessage(ipc_message); + return true; // Terminate the stub process + case PYTHONSTUB_CommandType::PYTHONSTUB_LoadGPUBuffers: + try { + LoadGPUBuffers(ipc_message); + } + catch (const PythonBackendException& pb_exception) { + LOG_ERROR + << "An error occurred while trying to load GPU buffers in the " + "Python backend stub: " + << pb_exception.what() << std::endl; + } + + break; + default: + break; + } + + return false; +} + +py::module +Stub::StubSetup() +{ + py::module sys = py::module_::import("sys"); + + model_context_.StubSetup(sys); + + py::module python_backend_utils = + py::module_::import("triton_python_backend_utils"); + py::module c_python_backend_utils = + py::module_::import("c_python_backend_utils"); + py::setattr( + python_backend_utils, "TritonError", + c_python_backend_utils.attr("TritonError")); + py::setattr( + python_backend_utils, "TritonModelException", + c_python_backend_utils.attr("TritonModelException")); + py::setattr( + python_backend_utils, "Tensor", c_python_backend_utils.attr("Tensor")); + py::setattr( + python_backend_utils, "InferenceRequest", + c_python_backend_utils.attr("InferenceRequest")); + py::setattr( + python_backend_utils, "InferenceResponse", + c_python_backend_utils.attr("InferenceResponse")); + py::setattr( + python_backend_utils, "Logger", c_python_backend_utils.attr("Logger")); + py::setattr( + python_backend_utils, "PreferredMemory", + c_python_backend_utils.attr("PreferredMemory")); + py::setattr( + python_backend_utils, "TRITONSERVER_MEMORY_GPU", + c_python_backend_utils.attr("TRITONSERVER_MEMORY_GPU")); + py::setattr( + python_backend_utils, "TRITONSERVER_MEMORY_CPU", + c_python_backend_utils.attr("TRITONSERVER_MEMORY_CPU")); + py::setattr( + python_backend_utils, "MetricFamily", + c_python_backend_utils.attr("MetricFamily")); + py::setattr( + python_backend_utils, "load_model", + c_python_backend_utils.attr("load_model")); + py::setattr( + python_backend_utils, "unload_model", + c_python_backend_utils.attr("unload_model")); + py::setattr( + python_backend_utils, "is_model_ready", + c_python_backend_utils.attr("is_model_ready")); + + c_python_backend_utils.attr("shared_memory") = py::cast(shm_pool_.get()); + + deserialize_bytes_ = python_backend_utils.attr("deserialize_bytes_tensor"); + serialize_bytes_ = python_backend_utils.attr("serialize_byte_tensor"); + + return sys; +} void -SigtermHandler(int signum) +Stub::AutoCompleteModelConfig( + bi::managed_external_buffer::handle_t string_handle, + std::string* auto_complete_config) { - sigterm_received = true; + py::module sys = StubSetup(); + + std::unique_ptr pb_string_shm = + PbString::LoadFromSharedMemory(shm_pool_, string_handle); + + py::module python_backend_utils = + py::module_::import("triton_python_backend_utils"); + py::object model_config = + python_backend_utils.attr("ModelConfig")(pb_string_shm->String()); + python_backend_utils.def( + "get_model_dir", + []() { + std::unique_ptr& stub = Stub::GetOrCreateInstance(); + return stub->GetModelDir(); + }, + py::return_value_policy::reference); + + if (py::hasattr(sys.attr("TritonPythonModel"), "auto_complete_config")) { + model_config = sys.attr("TritonPythonModel") + .attr("auto_complete_config")(model_config); + } + + if (!py::isinstance(model_config, python_backend_utils.attr("ModelConfig"))) { + throw PythonBackendException( + "auto_complete_config function in model '" + name_ + + "' must return a valid 
pb.ModelConfig object."); + } + py::module json = py::module_::import("json"); + (*auto_complete_config) = std::string( + py::str(json.attr("dumps")(model_config.attr("_model_config")))); } -class Stub { - bi::interprocess_mutex* stub_mutex_; - bi::interprocess_condition* stub_cond_; - bi::interprocess_mutex* parent_mutex_; - bi::interprocess_condition* parent_cond_; - bi::interprocess_mutex* health_mutex_; - bi::scoped_lock stub_lock_; - std::string model_path_; - IPCMessage* ipc_message_; - std::unique_ptr shm_pool_; - py::object PyRequest_; - py::object PyTensor_; - py::object model_instance_; - py::object deserialize_bytes_; - py::object serialize_bytes_; - ResponseBatch* response_batch_; +void +Stub::Initialize(bi::managed_external_buffer::handle_t map_handle) +{ + py::module sys = StubSetup(); + + py::module python_backend_utils = + py::module_::import("triton_python_backend_utils"); + py::module c_python_backend_utils = + py::module_::import("c_python_backend_utils"); + py::setattr( + python_backend_utils, "TritonError", + c_python_backend_utils.attr("TritonError")); + py::setattr( + python_backend_utils, "TritonModelException", + c_python_backend_utils.attr("TritonModelException")); + py::setattr( + python_backend_utils, "Tensor", c_python_backend_utils.attr("Tensor")); + py::setattr( + python_backend_utils, "InferenceRequest", + c_python_backend_utils.attr("InferenceRequest")); + py::setattr( + python_backend_utils, "InferenceResponse", + c_python_backend_utils.attr("InferenceResponse")); + c_python_backend_utils.attr("shared_memory") = py::cast(shm_pool_.get()); + + async_event_loop_ = py::none(); + background_futures_ = py::set(); + + py::object TritonPythonModel = sys.attr("TritonPythonModel"); + deserialize_bytes_ = python_backend_utils.attr("deserialize_bytes_tensor"); + serialize_bytes_ = python_backend_utils.attr("serialize_byte_tensor"); + python_backend_utils.def( + "get_model_dir", + []() { + std::unique_ptr& stub = Stub::GetOrCreateInstance(); + return stub->GetModelDir(); + }, + py::return_value_policy::reference); + model_instance_ = TritonPythonModel(); + + std::unordered_map map; + std::unique_ptr pb_map_shm = + PbMap::LoadFromSharedMemory(shm_pool_, map_handle); + + // Get the unordered_map representation of the map in shared memory. 
+ map = pb_map_shm->UnorderedMap(); + + py::dict model_config_params; + + for (const auto& pair : map) { + model_config_params[pair.first.c_str()] = pair.second; + } - public: - Stub( - int64_t shm_growth_size, int64_t shm_default_size, - std::string& shm_region_name, std::string& model_path) - { - try { - model_path_ = model_path; - stub_mutex_ = nullptr; - stub_cond_ = nullptr; - parent_mutex_ = nullptr; - parent_cond_ = nullptr; - health_mutex_ = nullptr; - - shm_pool_ = std::make_unique( - shm_region_name, shm_default_size, shm_growth_size); - - // Stub mutex and CV - bi::interprocess_mutex* stub_mutex; - off_t stub_mutex_offset; - shm_pool_->Map( - (char**)&stub_mutex, sizeof(bi::interprocess_mutex), - stub_mutex_offset); - - bi::interprocess_condition* stub_cv; - off_t stub_cv_offset; - shm_pool_->Map( - (char**)&stub_cv, sizeof(bi::interprocess_condition), stub_cv_offset); - - stub_cond_ = stub_cv; - stub_mutex_ = stub_mutex; - - // Parent Mutex and CV - bi::interprocess_mutex* parent_mutex; - off_t parent_mutex_offset; - shm_pool_->Map( - (char**)&parent_mutex, sizeof(bi::interprocess_mutex), - parent_mutex_offset); - - bi::interprocess_condition* parent_cv; - off_t parent_cv_offset; - shm_pool_->Map( - (char**)&parent_cv, sizeof(bi::interprocess_condition), - parent_cv_offset); - - bi::interprocess_mutex* health_mutex; - off_t health_mutex_offset; - shm_pool_->Map( - (char**)&health_mutex, sizeof(bi::interprocess_mutex), - health_mutex_offset); - - health_mutex_ = health_mutex; - parent_mutex_ = parent_mutex; - parent_cond_ = parent_cv; - - IPCMessage* ipc_message; - off_t ipc_offset; - shm_pool_->Map((char**)&ipc_message, sizeof(IPCMessage), ipc_offset); - - off_t response_batch_offset; - shm_pool_->Map( - (char**)&response_batch_, sizeof(Response), response_batch_offset); - ipc_message->response_batch = response_batch_offset; - response_batch_->has_error = false; - ipc_message_ = ipc_message; - - stub_lock_ = bi::scoped_lock(*stub_mutex_); - NotifyParent(); - } - catch (const PythonBackendException& pb_exception) { - LOG_INFO << pb_exception.what() << std::endl; - exit(1); - } - } - - void NotifyParent() - { - if (parent_mutex_ == nullptr || parent_cond_ == nullptr) { - LOG_INFO << "Parent process mutex and conditional variable is not " - "initialized. " - << "Exiting.."; - exit(1); - } + LaunchStubToParentQueueMonitor(); + LaunchParentToStubQueueMonitor(); + + // Call initialize if exists. + if (py::hasattr(model_instance_, "initialize")) { + model_instance_.attr("initialize")(model_config_params); + } + + initialized_ = true; +} + +void +Stub::LoadGPUBuffers(std::unique_ptr& ipc_message) +{ + ScopedDefer load_gpu_buffer_response([this] { + // LoadGPUBuffers must let the parent process know when loading the + // buffers have been finished. 
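// Shape of the exchange LoadGPUBuffers performs (illustrative helper; the
// element type of gpu_tensors_ is assumed to be std::shared_ptr<PbTensor>):
// the parent provides exactly one destination PbMemory per GPU output tensor
// collected during execute, and the buffers are copied pairwise in order.
void
ExamplePairwiseGpuCopy(
    std::vector<std::unique_ptr<PbMemory>>& dst_buffers,
    std::vector<std::shared_ptr<PbTensor>>& gpu_tensors)
{
  if (dst_buffers.size() != gpu_tensors.size()) {
    throw PythonBackendException("GPU buffer count mismatch.");
  }
  for (size_t i = 0; i < gpu_tensors.size(); i++) {
    PbMemory::CopyBuffer(dst_buffers[i], gpu_tensors[i]->Memory());
  }
}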
+ parent_message_queue_->Push(DUMMY_MESSAGE); + gpu_tensors_.clear(); + }); + + AllocatedSharedMemory gpu_buffers_handle = + shm_pool_->Load(ipc_message->Args()); + + if (!gpu_buffers_handle.data_->success) { + std::unique_ptr error = PbString::LoadFromSharedMemory( + shm_pool_, gpu_buffers_handle.data_->error); + throw PythonBackendException( + "Failed to load GPU buffers: " + error->String()); + } - bi::scoped_lock lk(*parent_mutex_); - parent_cond_->notify_one(); + uint64_t gpu_buffer_count = gpu_buffers_handle.data_->buffer_count; + AllocatedSharedMemory + gpu_buffers_handle_shm = + shm_pool_->Load( + gpu_buffers_handle.data_->buffers); + + if (gpu_tensors_.size() != gpu_buffer_count) { + throw PythonBackendException( + std::string("GPU buffers size does not match the provided buffers: ") + + std::to_string(gpu_tensors_.size()) + + " != " + std::to_string(gpu_buffer_count)); } - bool& Health() { return ipc_message_->health; } + std::vector> dst_buffers; + for (size_t i = 0; i < gpu_tensors_.size(); i++) { + std::unique_ptr dst_buffer = PbMemory::LoadFromSharedMemory( + shm_pool_, gpu_buffers_handle_shm.data_.get()[i], + true /* open_cuda_handle */); + dst_buffers.emplace_back(std::move(dst_buffer)); + } - std::unique_ptr& GetSharedMemory() { return shm_pool_; } + for (size_t i = 0; i < gpu_tensors_.size(); i++) { + std::shared_ptr& src_buffer = gpu_tensors_[i]; + PbMemory::CopyBuffer(dst_buffers[i], src_buffer->Memory()); + } +} - void SetErrorForResponse(Response* response, const char* err_message) - { - off_t err_string_offset = 0; - response->is_error_set = false; - response->has_error = true; - LOG_IF_EXCEPTION( - SaveStringToSharedMemory(shm_pool_, err_string_offset, err_message)); +py::list +Stub::LoadRequestsFromSharedMemory(RequestBatch* request_batch_shm_ptr) +{ + uint32_t batch_size = request_batch_shm_ptr->batch_size; + py::list py_request_list; - if (err_string_offset != 0) { - response->error = err_string_offset; - response->is_error_set = true; - } + if (batch_size == 0) { + return py_request_list; } - void SetErrorForResponseBatch(const char* err_message) - { - off_t err_string_offset = 0; - response_batch_->is_error_set = false; - response_batch_->has_error = true; - LOG_IF_EXCEPTION( - SaveStringToSharedMemory(shm_pool_, err_string_offset, err_message)); + bi::managed_external_buffer::handle_t* request_shm_handle = + reinterpret_cast( + reinterpret_cast(request_batch_shm_ptr) + + sizeof(RequestBatch)); + + for (size_t i = 0; i < batch_size; i++) { + std::shared_ptr infer_request = + InferRequest::LoadFromSharedMemory( + shm_pool_, request_shm_handle[i], true /* open_cuda_handle */, + &ipc_control_->decoupled /* is_model_decoupled */); + py_request_list.append(infer_request); + } - if (err_string_offset != 0) { - response_batch_->error = err_string_offset; - response_batch_->is_error_set = true; + return py_request_list; +} + +void +Stub::ProcessRequests(RequestBatch* request_batch_shm_ptr) +{ + py::list py_request_list = + LoadRequestsFromSharedMemory(request_batch_shm_ptr); + std::unique_ptr execute_response; + + std::optional> response_batch; + bool has_exception = false; + std::string error_string; + std::unique_ptr error_string_shm; + std::string err_message; + + ScopedDefer execute_finalize([this] { stub_message_queue_->Pop(); }); + ScopedDefer _( + [this, &execute_response] { SendIPCMessage(execute_response); }); + py::object execute_return; + py::object coroutine_return; + try { + if (!py::hasattr(model_instance_, "execute")) { + std::string message = "Python 
model " + model_context_.PythonModelPath() + + " does not implement `execute` method."; + throw PythonBackendException(message); } - } - void ProcessResponse( - Response* response_shm, ResponseBatch* response_batch, - py::handle response, py::object& serialize_bytes) - { - // Initialize has_error to false - response_shm->has_error = false; - - py::bool_ py_has_error = response.attr("has_error")(); - bool has_error = py_has_error; - - if (has_error) { - py::str py_string_err = py::str(response.attr("error")()); - std::string response_error = py_string_err; - SetErrorForResponse(response_shm, response_error.c_str()); - - // Skip the response value when the response has error. - return; - } - - py::list output_tensors = response.attr("output_tensors")(); - size_t output_tensor_length = py::len(output_tensors); - - size_t j = 0; - Tensor* output_tensors_shm; - off_t output_tensors_offset; - shm_pool_->Map( - (char**)&output_tensors_shm, sizeof(Tensor) * output_tensor_length, - output_tensors_offset); - response_shm->outputs = output_tensors_offset; - response_shm->outputs_size = output_tensor_length; - - for (auto& output_tensor : output_tensors) { - Tensor* output_tensor_shm = &output_tensors_shm[j]; - py::str name = output_tensor.attr("name")(); - std::string output_name = name; - - py::array numpy_array = output_tensor.attr("as_numpy")(); - py::int_ dtype = output_tensor.attr("triton_dtype")(); - py::buffer_info buffer = numpy_array.request(); - - int dtype_triton_int = dtype; - TRITONSERVER_DataType dtype_triton = - static_cast(dtype_triton_int); - - char* data_in_shm; - char* data_ptr; - const TRITONSERVER_MemoryType memory_type = TRITONSERVER_MEMORY_CPU; - const int memory_type_id = 0; - - size_t dims_count = numpy_array.ndim(); - int64_t dims[dims_count]; - ssize_t byte_size; - - // Custom handling for type bytes. - if (dtype_triton == TRITONSERVER_TYPE_BYTES) { - py::object serialized_bytes_or_none = serialize_bytes(numpy_array); - if (serialize_bytes.is_none()) { - const char* err_message = "An error happened during serialization."; - LOG_INFO << err_message; - SetErrorForResponse(response_shm, err_message); - return; - } + { + NVTX_RANGE(nvtx_, "PyExecute " + name_); - py::bytes serialized_bytes = serialized_bytes_or_none; - data_ptr = PyBytes_AsString(serialized_bytes.ptr()); - byte_size = PyBytes_Size(serialized_bytes.ptr()); + execute_return = model_instance_.attr("execute")(py_request_list); + + bool is_coroutine = py::module::import("asyncio") + .attr("iscoroutine")(execute_return) + .cast(); + if (is_coroutine) { + if (IsDecoupled()) { + // Do not wait for async decoupled execute to return. 
+ RunCoroutine(execute_return, true /* in_background */); + } else { + coroutine_return = + RunCoroutine(execute_return, false /* in_background */); + ProcessReturnedResponses( + py_request_list, coroutine_return, response_batch); + } } else { - data_ptr = static_cast(buffer.ptr); - byte_size = numpy_array.nbytes(); + ProcessReturnedResponses( + py_request_list, execute_return, response_batch); } + } + } + catch (const PythonBackendException& pb_exception) { + has_exception = true; + error_string = pb_exception.what(); + } + catch (const py::error_already_set& error) { + has_exception = true; + error_string = error.what(); + } - const ssize_t* numpy_shape = numpy_array.shape(); - for (size_t i = 0; i < dims_count; i++) { - dims[i] = numpy_shape[i]; + if (has_exception) { + err_message = std::string( + "Failed to process the request(s) for model '" + name_ + + "', message: ") + + error_string; + LOG_ERROR << err_message.c_str(); + if (!response_batch) { + response_batch = shm_pool_->Construct( + sizeof(ResponseBatch) + sizeof(IPCMessageShm)); + } + ResponseBatch* response_batch_shm_ptr = reinterpret_cast( + response_batch.value().data_.get() + sizeof(IPCMessageShm)); + + // The backend will clean up the response factory if there is an error in + // the response batch. For decoupled mode, it is necessary to handle cases + // where the response sender should have already cleaned up, ensuring the + // backend does not delete the response factory again during error handling. + if (IsDecoupled()) { + for (py::handle py_request : py_request_list) { + InferRequest* request = py_request.cast(); + if (request->GetResponseSender()->IsClosed()) { + response_batch_shm_ptr->is_response_factory_deleted = true; + } } + } - SaveTensorToSharedMemory( - shm_pool_, output_tensor_shm, data_in_shm, memory_type, - memory_type_id, byte_size, output_name.c_str(), dims, dims_count, - dtype_triton); + response_batch_shm_ptr->has_error = true; + error_string_shm = PbString::Create(shm_pool_, err_message); + response_batch_shm_ptr->error = error_string_shm->ShmHandle(); + response_batch_shm_ptr->is_error_set = true; + response_batch_shm_ptr->batch_size = 0; + // Once the error is sent to the backend, the backend is supposed to close + // all response factories if not already closed, so closing all response + // senders if not already closed to prevent the model from sending more + // responses after the factories are closed. + for (py::handle py_request : py_request_list) { + InferRequest* request = py_request.cast(); + request->GetResponseSender()->Close(); + } + } else { + if (!response_batch) { + response_batch = shm_pool_->Construct( + sizeof(ResponseBatch) + sizeof(IPCMessageShm)); + ResponseBatch* response_batch_shm_ptr = reinterpret_cast( + response_batch.value().data_.get() + sizeof(IPCMessageShm)); + response_batch_shm_ptr->batch_size = 0; + } + ResponseBatch* response_batch_shm_ptr = reinterpret_cast( + response_batch.value().data_.get() + sizeof(IPCMessageShm)); + response_batch_shm_ptr->has_error = false; + response_batch_shm_ptr->is_error_set = false; + } - // TODO: We can remove this memcpy if the numpy object - // is already in shared memory. 
- std::copy(data_ptr, data_ptr + byte_size, data_in_shm); - j += 1; + execute_response = IPCMessage::Create( + reinterpret_cast(response_batch.value().data_.get()), + response_batch.value().handle_); + execute_response->Args() = + response_batch.value().handle_ + sizeof(IPCMessageShm); + execute_response->InlineResponse() = false; + execute_response->Command() = PYTHONSTUB_ExecuteResponse; + _.Complete(); + execute_finalize.Complete(); +} + +void +Stub::ProcessResponse(InferResponse* response) +{ + response->SaveToSharedMemory(shm_pool_, false /* copy_gpu */); + + for (auto& output_tensor : response->OutputTensors()) { + if (!output_tensor->IsCPU()) { + gpu_tensors_.push_back(output_tensor); } } +} - void ProcessRequest( - Request* request, ResponseBatch* response_batch, - py::object& infer_request, py::object& PyRequest, py::object& PyTensor, - py::object& deserialize_bytes) - { - char* id = nullptr; - LoadStringFromSharedMemory(shm_pool_, request->id, id); - - uint32_t requested_input_count = request->requested_input_count; - Tensor* input_tensors; - shm_pool_->MapOffset( - (char**)&input_tensors, sizeof(Tensor) * requested_input_count, - request->inputs); - - py::list py_input_tensors; - for (size_t input_idx = 0; input_idx < requested_input_count; ++input_idx) { - Tensor* input_tensor = &input_tensors[input_idx]; - - char* name = nullptr; - LoadStringFromSharedMemory(shm_pool_, input_tensor->name, name); - - RawData* raw_data; - shm_pool_->MapOffset( - (char**)&raw_data, sizeof(RawData), input_tensor->raw_data); - - char* data; - shm_pool_->MapOffset( - (char**)&data, raw_data->byte_size, raw_data->memory_ptr); - - size_t dims_count = input_tensor->dims_count; - - int64_t* dims; - shm_pool_->MapOffset( - (char**)&dims, sizeof(int64_t) * dims_count, input_tensor->dims); - - TRITONSERVER_DataType dtype = input_tensor->dtype; - std::vector shape{dims, dims + dims_count}; - py::dtype dtype_numpy; - switch (dtype) { - case TRITONSERVER_TYPE_BOOL: - dtype_numpy = py::dtype(py::format_descriptor::format()); - break; - case TRITONSERVER_TYPE_UINT8: - dtype_numpy = py::dtype(py::format_descriptor::format()); - break; - case TRITONSERVER_TYPE_UINT16: - dtype_numpy = py::dtype(py::format_descriptor::format()); - break; - case TRITONSERVER_TYPE_UINT32: - dtype_numpy = py::dtype(py::format_descriptor::format()); - break; - case TRITONSERVER_TYPE_UINT64: - dtype_numpy = py::dtype(py::format_descriptor::format()); - break; - case TRITONSERVER_TYPE_INT8: - dtype_numpy = py::dtype(py::format_descriptor::format()); - break; - case TRITONSERVER_TYPE_INT16: - dtype_numpy = py::dtype(py::format_descriptor::format()); - break; - case TRITONSERVER_TYPE_INT32: - dtype_numpy = py::dtype(py::format_descriptor::format()); - break; - case TRITONSERVER_TYPE_INT64: - dtype_numpy = py::dtype(py::format_descriptor::format()); - break; - case TRITONSERVER_TYPE_FP16: - // Will be reinterpreted in the python code. - dtype_numpy = py::dtype(py::format_descriptor::format()); - break; - case TRITONSERVER_TYPE_FP32: - dtype_numpy = py::dtype(py::format_descriptor::format()); - break; - case TRITONSERVER_TYPE_FP64: - dtype_numpy = py::dtype(py::format_descriptor::format()); - break; - case TRITONSERVER_TYPE_BYTES: - // Will be reinterpreted in the python code. - dtype_numpy = py::dtype(py::format_descriptor::format()); - break; - default: - break; +void +Stub::ProcessReturnedResponses( + py::list py_requests, py::object py_responses_obj, + std::optional>& response_batch) +{ + // Return if there is nothing to process. 
+ if (py::isinstance(py_responses_obj)) { + return; + } + // Only non-decoupled may return responses. + if (IsDecoupled()) { + throw PythonBackendException( + "Python model '" + name_ + + "' is using the decoupled mode and the execute function must return " + "None."); + } + // Check responses is a list. + if (!py::isinstance(py_responses_obj)) { + throw PythonBackendException( + "Expected a list in the execute return, found type '" + + std::string(py::str(py_responses_obj.get_type())) + "'."); + } + py::list py_responses = py_responses_obj; + // Responses and requests length must match. + size_t requests_size = py::len(py_requests); + size_t responses_size = py::len(py_responses); + if (requests_size != responses_size) { + throw PythonBackendException( + "Number of InferenceResponse objects do not match the number of " + "InferenceRequest objects. InferenceRequest(s) size is:" + + std::to_string(requests_size) + ", and InferenceResponse(s) size is:" + + std::to_string(responses_size) + "\n"); + } + + for (size_t i = 0; i < responses_size; i++) { + if (!py::isinstance(py_responses[i])) { + InferRequest* request = py_requests[i].cast(); + // Response must be None if rescheduled. + if (request->ReleaseFlags() == TRITONSERVER_REQUEST_RELEASE_RESCHEDULE) { + throw PythonBackendException( + "Expected a None object in the execute function return list for " + "reschduled request, found type '" + + std::string(py::str(py_responses[i].get_type())) + "'."); + } + // Send the response. + if (!py::isinstance(py_responses[i])) { + throw PythonBackendException( + "Expected an 'InferenceResponse' object in the execute function " + "return list, found type '" + + std::string(py::str(py_responses[i].get_type())) + "'."); } + InferResponse* response = py_responses[i].cast(); try { - // Custom handling for bytes - if (dtype == TRITONSERVER_TYPE_BYTES) { - py::array numpy_array( - dtype_numpy, {raw_data->byte_size}, (void*)data); - py::list dims = py::cast(shape); - - py::object deserialized = - deserialize_bytes(numpy_array).attr("reshape")(dims); - - py::object py_input_tensor = - PyTensor(name, deserialized, static_cast(dtype)); - py_input_tensors.append(py_input_tensor); - } else { - py::array numpy_array(dtype_numpy, shape, (void*)data); - py::object py_input_tensor = - PyTensor(name, numpy_array, static_cast(dtype)); - py_input_tensors.append(py_input_tensor); + request->GetResponseSender()->UpdateStateAndCounters( + response, TRITONSERVER_RESPONSE_COMPLETE_FINAL); + } + catch (const PythonBackendException& pb_exception) { + // Handle the exception here to catch the error when there's a response + // returned from `execute()`. + if (request->GetResponseSender()->IsClosed()) { + response_batch = std::move(shm_pool_->Construct( + sizeof(ResponseBatch) + sizeof(IPCMessageShm))); + ResponseBatch* response_batch_shm_ptr = + reinterpret_cast( + response_batch.value().data_.get() + sizeof(IPCMessageShm)); + response_batch_shm_ptr->batch_size = 0; + response_batch_shm_ptr->is_response_factory_deleted = true; } + throw pb_exception; + } + } + } + // Return all the created responses using response_batch. The reason + // that both of the paths are available is that sending the responses + // using response_batch is faster than using `response_sender`. 
+ response_batch = std::move(shm_pool_->Construct( + sizeof(IPCMessageShm) + + requests_size * sizeof(bi::managed_external_buffer::handle_t) + + sizeof(ResponseBatch))); + ResponseBatch* response_batch_shm_ptr = reinterpret_cast( + response_batch.value().data_.get() + sizeof(IPCMessageShm)); + + bi::managed_external_buffer::handle_t* responses_shm_handle = + reinterpret_cast( + response_batch.value().data_.get() + sizeof(ResponseBatch) + + sizeof(IPCMessageShm)); + for (size_t i = 0; i < responses_size; i++) { + // Check the return type of execute function. + InferRequest* infer_request = py_requests[i].cast(); + InferResponse* infer_response = py_responses[i].cast(); + if (!py::isinstance(py_responses[i])) { + infer_response->PruneOutputTensors(infer_request->RequestedOutputNames()); + ProcessResponse(infer_response); + responses_shm_handle[i] = infer_response->ShmHandle(); + } else { + responses_shm_handle[i] = 0; + } + } + response_batch_shm_ptr->batch_size = requests_size; +} + +py::object +Stub::GetAsyncEventLoop() +{ + if (py::isinstance(async_event_loop_)) { + // Create the event loop if not already. + py::module asyncio = py::module_::import("asyncio"); + async_event_loop_ = asyncio.attr("new_event_loop")(); + asyncio.attr("set_event_loop")(async_event_loop_); + py::object py_thread = + py::module_::import("threading") + .attr("Thread")( + "target"_a = async_event_loop_.attr("run_forever"), + "daemon"_a = true); + py_thread.attr("start")(); + } + return async_event_loop_; +} + +py::object +Stub::RunCoroutine(py::object coroutine, bool in_background) +{ + py::object loop = GetAsyncEventLoop(); + py::object py_future = py::module_::import("asyncio").attr( + "run_coroutine_threadsafe")(coroutine, loop); + if (in_background) { + py_future.attr("add_done_callback")( + py::module_::import("c_python_backend_utils") + .attr("async_event_future_done_callback")); + background_futures_.attr("add")(py_future); + return py::none(); + } + return py_future.attr("result")(); +} + +void +Stub::BackgroundFutureDone(const py::object& py_future) +{ + ScopedDefer _([this, &py_future] { + // Remove future from background + try { + background_futures_.attr("remove")(py_future); + } + catch (const py::error_already_set& error) { + LOG_ERROR << "Cannot remove future from background; " << error.what(); + } + }); + // TODO: Why using `py_future.result()` with error hangs on exit? + try { + py::object exception = py_future.attr("exception")(); + if (!py::isinstance(exception)) { + std::string err_msg = ""; + py::object traceback = py::module_::import("traceback") + .attr("TracebackException") + .attr("from_exception")(exception) + .attr("format")(); + for (py::handle line : traceback) { + err_msg += py::str(line); + } + LOG_ERROR << err_msg; + } + } + catch (const PythonBackendException& pb_exception) { + LOG_ERROR << pb_exception.what(); + } + catch (const py::error_already_set& error) { + LOG_ERROR << error.what(); + } +} + +void +Stub::UpdateHealth() +{ + bi::scoped_lock lock(*health_mutex_); + ipc_control_->stub_health = true; +} + +void +Stub::Finalize() +{ + finalizing_ = true; + if (initialized_) { + // Stop async event loop if created. + if (!py::isinstance(async_event_loop_)) { + async_event_loop_.attr("stop")(); + } + // Call finalize if exists. 
+ if (py::hasattr(model_instance_, "finalize")) { + try { + model_instance_.attr("finalize")(); } catch (const py::error_already_set& e) { LOG_INFO << e.what(); - throw PythonBackendException(e.what()); - return; } } + } +#ifdef TRITON_ENABLE_GPU + // We also need to destroy created proxy CUDA streams for dlpack, if any + std::lock_guard lock(dlpack_proxy_stream_pool_mu_); + for (auto& entry : dlpack_proxy_stream_pool_) { + // We don't need to switch device to destroy a stream + // https://stackoverflow.com/questions/64663943/how-to-destroy-a-stream-that-was-created-on-a-specific-device + cudaError_t err = cudaStreamDestroy(entry.second); + if (err != cudaSuccess) { + LOG_ERROR + << "Failed to destroy dlpack CUDA proxy stream on device with id " + + std::to_string(entry.first); + } + } +#endif +} - py::list py_requested_output_names; +void +Stub::SendIPCMessage(std::unique_ptr& ipc_message) +{ + bool success = false; + while (!success) { + parent_message_queue_->Push(ipc_message->ShmHandle(), 1000, success); + } +} - uint32_t requested_output_count = request->requested_output_count; - off_t* output_names; - shm_pool_->MapOffset( - (char**)&output_names, sizeof(off_t) * requested_output_count, - request->requested_output_names); +void +Stub::SendIPCUtilsMessage(std::unique_ptr& ipc_message) +{ + bool success = false; + while (!success) { + stub_to_parent_mq_->Push(ipc_message->ShmHandle(), 1000, success); + } +} - for (size_t output_idx = 0; output_idx < requested_output_count; - ++output_idx) { - char* output_name = nullptr; - LoadStringFromSharedMemory( - shm_pool_, output_names[output_idx], output_name); - py_requested_output_names.append(output_name); +Stub::~Stub() +{ +#ifdef TRITON_ENABLE_GPU + try { + if (shm_pool_ != nullptr) { + CUDAHandler& cuda_api = CUDAHandler::getInstance(); + for (auto& m : + shm_pool_->GetCUDAMemoryPoolManager()->CUDAPoolAddressMap()) { + if (m.second != nullptr) { + cuda_api.CloseCudaHandle(m.first, m.second); + } + } } + } + catch (const PythonBackendException& pb_exception) { + std::cerr << "Error when closing CUDA handle: " << pb_exception.what(); + } +#endif + + // Ensure the interpreter is active before trying to clean up. + if (Py_IsInitialized()) { + py::gil_scoped_acquire acquire; + py::object async_event_loop_local(std::move(async_event_loop_)); + py::object background_futures_local(std::move(background_futures_)); + py::object model_instance_local(std::move(model_instance_)); + } + + stub_message_queue_.reset(); + parent_message_queue_.reset(); + stub_to_parent_mq_.reset(); + memory_manager_message_queue_.reset(); +} - infer_request = PyRequest( - py_input_tensors, id, request->correlation_id, - py_requested_output_names); +std::unique_ptr Stub::stub_instance_; + +std::unique_ptr& +Stub::GetOrCreateInstance() +{ + if (Stub::stub_instance_.get() == nullptr) { + Stub::stub_instance_ = std::make_unique(); } - void SetResponseFromException(const PythonBackendException& pb_exception) + return Stub::stub_instance_; +} + +void +Stub::LaunchStubToParentQueueMonitor() +{ + stub_to_parent_thread_ = true; + stub_to_parent_queue_monitor_ = + std::thread(&Stub::ServiceStubToParentRequests, this); + Logger::GetOrCreateInstance()->SetBackendLoggingActive(true); +} + +void +Stub::TerminateStubToParentQueueMonitor() +{ + Logger::GetOrCreateInstance()->SetBackendLoggingActive(false); { - SetErrorForResponseBatch(pb_exception.what()); + std::lock_guard guard{stub_to_parent_message_mu_}; + // Push a dummy message to signal the thread to terminate. 
+ stub_to_parent_buffer_.push(DUMMY_MESSAGE); } + stub_to_parent_message_cv_.notify_one(); + stub_to_parent_queue_monitor_.join(); +} - int Execute() - { - // Reset the value for has_error - response_batch_->has_error = false; +void +Stub::EnqueueLogRequest(std::unique_ptr& log_ptr) +{ + std::unique_ptr utils_msg_payload = + std::make_unique( + PYTHONSTUB_LogRequest, reinterpret_cast(log_ptr.release())); + EnqueueUtilsMessage(std::move(utils_msg_payload)); +} - RequestBatch* request_batch; - try { - shm_pool_->MapOffset( - (char**)&request_batch, sizeof(RequestBatch), - ipc_message_->request_batch); +void +Stub::ServiceStubToParentRequests() +{ + while (stub_to_parent_thread_) { + std::unique_lock guard{stub_to_parent_message_mu_}; + while (stub_to_parent_buffer_.empty()) { + stub_to_parent_message_cv_.wait(guard); } - catch (const PythonBackendException& pb_exception) { - LOG_EXCEPTION(pb_exception); - SetResponseFromException(pb_exception); - return 0; + // On exit, will send messages to the parent process until + // DUMMY_MESSAGE is reached + std::unique_ptr utils_msg_payload = + std::move(stub_to_parent_buffer_.front()); + if (utils_msg_payload == DUMMY_MESSAGE) { + stub_to_parent_buffer_.pop(); + break; + } else { + stub_to_parent_buffer_.pop(); + if (utils_msg_payload->command_type == PYTHONSTUB_LogRequest) { + SendLogMessage(utils_msg_payload); + } else if ( + (utils_msg_payload->command_type == + PYTHONSTUB_BLSDecoupledInferPayloadCleanup) || + (utils_msg_payload->command_type == + PYTHONSTUB_DecoupledResponseFactoryCleanup)) { + SendCleanupId(utils_msg_payload, utils_msg_payload->command_type); + } else if ( + utils_msg_payload->command_type == PYTHONSTUB_IsRequestCancelled) { + SendIsCancelled(utils_msg_payload); + } else if ( + utils_msg_payload->command_type == PYTHONSTUB_CancelBLSInferRequest) { + SendCancelBLSRequest(utils_msg_payload); + } else { + std::cerr << "Error when sending message via stub_to_parent message " + "buffer - unknown command\n"; + } } - uint32_t batch_size = request_batch->batch_size; + } +} - // An empty batch size indicates termination - if (batch_size == 0) { - return 1; +void +Stub::SendLogMessage(std::unique_ptr& utils_msg_payload) +{ + std::unique_ptr log_send_message = std::unique_ptr( + reinterpret_cast(utils_msg_payload->utils_message_ptr)); + + std::unique_ptr log_request_shm = PbLogShm::Create( + shm_pool_, log_send_message->Filename(), log_send_message->Line(), + log_send_message->Message(), log_send_message->Level()); + LogSendMessage* send_message_payload = log_request_shm->LogMessage(); + send_message_payload->waiting_on_stub = false; + std::unique_ptr log_request_msg = + IPCMessage::Create(shm_pool_, false /* inline_response */); + log_request_msg->Args() = log_request_shm->ShmHandle(); + log_request_msg->Command() = PYTHONSTUB_LogRequest; + ScopedDefer _([send_message_payload] { + { + bi::scoped_lock guard{send_message_payload->mu}; + send_message_payload->waiting_on_stub = false; + send_message_payload->cv.notify_all(); } + }); - Request* requests; - try { - shm_pool_->MapOffset( - (char**)&requests, sizeof(Request) * batch_size, - request_batch->requests); + { + // Send a message to be caught by the log monitor thread in python_be.cc + bi::scoped_lock guard{send_message_payload->mu}; + SendIPCUtilsMessage(log_request_msg); + while (!send_message_payload->waiting_on_stub) { + send_message_payload->cv.wait(guard); } - catch (const PythonBackendException& pb_exception) { - LOG_EXCEPTION(pb_exception); - 
SetResponseFromException(pb_exception); - return 0; + } +} + +void +Stub::SendCleanupId( + std::unique_ptr& utils_msg_payload, + const PYTHONSTUB_CommandType& command_type) +{ + void* id = utils_msg_payload->utils_message_ptr; + if (command_type == PYTHONSTUB_BLSDecoupledInferPayloadCleanup) { + std::lock_guard lock(response_iterator_map_mu_); + response_iterator_map_.erase(id); + } + + std::unique_ptr ipc_message = + IPCMessage::Create(shm_pool_, true /* inline_response */); + ipc_message->Command() = command_type; + AllocatedSharedMemory cleanup_request_message = + shm_pool_->Construct( + sizeof(CleanupMessage) + + sizeof(bi::managed_external_buffer::handle_t)); + CleanupMessage* cleanup_message_ptr = + reinterpret_cast(cleanup_request_message.data_.get()); + cleanup_message_ptr->id = id; + cleanup_message_ptr->waiting_on_stub = false; + ipc_message->Args() = cleanup_request_message.handle_; + + { + bi::scoped_lock lock{ + *(ipc_message->ResponseMutex())}; + SendIPCUtilsMessage(ipc_message); + while (!cleanup_message_ptr->waiting_on_stub) { + ipc_message->ResponseCondition()->wait(lock); } + } +} - py::list py_request_list; - for (size_t i = 0; i < batch_size; i++) { - Request* request = &requests[i]; - py::object infer_request; - try { - ProcessRequest( - request, response_batch_, infer_request, PyRequest_, PyTensor_, - deserialize_bytes_); - } - catch (const PythonBackendException& pb_exception) { - LOG_EXCEPTION(pb_exception); - SetResponseFromException(pb_exception); - return 0; - } - py_request_list.append(infer_request); +void +Stub::EnqueueCleanupId(void* id, const PYTHONSTUB_CommandType& command_type) +{ + if (id != nullptr) { + std::unique_ptr utils_msg_payload = + std::make_unique(command_type, id); + EnqueueUtilsMessage(std::move(utils_msg_payload)); + } +} + +void +Stub::SendCancelBLSRequest( + std::unique_ptr& utils_msg_payload) +{ + PbBLSCancel* pb_bls_cancel = + reinterpret_cast(utils_msg_payload->utils_message_ptr); + pb_bls_cancel->SaveToSharedMemory(shm_pool_); + + CancelBLSRequestMessage* message_payload = pb_bls_cancel->ShmPayload(); + std::unique_ptr ipc_message = + IPCMessage::Create(shm_pool_, false /* inline_response */); + ipc_message->Command() = utils_msg_payload->command_type; + ipc_message->Args() = pb_bls_cancel->ShmHandle(); + + bool is_cancelled = false; + { + bi::scoped_lock lk(message_payload->mu); + + SendIPCUtilsMessage(ipc_message); + while (!message_payload->waiting_on_stub) { + message_payload->cv.wait(lk); } - py::list responses; + is_cancelled = message_payload->is_cancelled; + message_payload->waiting_on_stub = false; + message_payload->cv.notify_all(); + } + pb_bls_cancel->ReportIsCancelled(is_cancelled); +} - if (!py::hasattr(model_instance_, "execute")) { - std::string message = "Python model " + model_path_ + - " does not implement `execute` method."; - LOG_INFO << message; - SetErrorForResponseBatch(message.c_str()); +void +Stub::EnqueueCancelBLSRequest(PbBLSCancel* pb_bls_cancel) +{ + std::unique_ptr utils_msg_payload = + std::make_unique( + PYTHONSTUB_CancelBLSInferRequest, + reinterpret_cast(pb_bls_cancel)); + EnqueueUtilsMessage(std::move(utils_msg_payload)); +} + +void +Stub::EnqueueIsCancelled(PbCancel* pb_cancel) +{ + std::unique_ptr utils_msg_payload = + std::make_unique( + PYTHONSTUB_IsRequestCancelled, reinterpret_cast(pb_cancel)); + EnqueueUtilsMessage(std::move(utils_msg_payload)); +} + +void +Stub::SendIsCancelled(std::unique_ptr& utils_msg_payload) +{ + PbCancel* pb_cancel = + 
reinterpret_cast(utils_msg_payload->utils_message_ptr); + pb_cancel->SaveToSharedMemory(shm_pool_); - return 0; + IsCancelledMessage* message_payload = pb_cancel->ShmPayload(); + std::unique_ptr ipc_message = + IPCMessage::Create(shm_pool_, false /* inline_response */); + ipc_message->Command() = utils_msg_payload->command_type; + ipc_message->Args() = pb_cancel->ShmHandle(); + + bool is_cancelled = false; + { + bi::scoped_lock lk(message_payload->mu); + + SendIPCUtilsMessage(ipc_message); + while (!message_payload->waiting_on_stub) { + message_payload->cv.wait(lk); } - // Execute Response - try { - responses = model_instance_.attr("execute")(py_request_list); + is_cancelled = message_payload->is_cancelled; + message_payload->waiting_on_stub = false; + message_payload->cv.notify_all(); + } + pb_cancel->ReportIsCancelled(is_cancelled); +} + +bool +Stub::StubToParentServiceActive() +{ + return stub_to_parent_thread_; +} + +void +Stub::LaunchParentToStubQueueMonitor() +{ + parent_to_stub_thread_ = true; + parent_to_stub_queue_monitor_ = + std::thread(&Stub::ParentToStubMQMonitor, this); +} + +void +Stub::TerminateParentToStubQueueMonitor() +{ + if (parent_to_stub_thread_) { + parent_to_stub_thread_ = false; + // Push a dummy message to signal the thread to terminate. + parent_to_stub_mq_->Push(DUMMY_MESSAGE); + parent_to_stub_queue_monitor_.join(); + } +} + +void +Stub::ParentToStubMQMonitor() +{ + while (parent_to_stub_thread_) { + bi::managed_external_buffer::handle_t handle = parent_to_stub_mq_->Pop(); + if (handle == DUMMY_MESSAGE) { + break; } - catch (const py::error_already_set& e) { - LOG_INFO << e.what(); - SetErrorForResponseBatch(e.what()); - return 0; + std::unique_ptr ipc_message = + IPCMessage::LoadFromSharedMemory(shm_pool_, handle); + + switch (ipc_message->Command()) { + case PYTHONSTUB_CommandType::PYTHONSTUB_CUDAPoolInitializeRequest: { + GetCUDAMemoryPoolAddress(ipc_message); + } break; + case PYTHONSTUB_CommandType::PYTHONSTUB_InferStreamExecResponse: { + ProcessBLSResponseDecoupled(ipc_message); + } break; + default: + break; } + } +} - Response* responses_shm; - off_t responses_shm_offset; - size_t response_size = py::len(responses); +bool +Stub::ParentToStubServiceActive() +{ + return parent_to_stub_thread_; +} - try { - shm_pool_->Map( - (char**)&responses_shm, sizeof(Response) * response_size, - responses_shm_offset); - } - catch (const PythonBackendException& pb_exception) { - LOG_EXCEPTION(pb_exception); - SetResponseFromException(pb_exception); - return 0; +std::shared_ptr +Stub::GetResponseIterator(std::shared_ptr infer_response) +{ + std::lock_guard lock(response_iterator_map_mu_); + if (response_iterator_map_.find(infer_response->Id()) != + response_iterator_map_.end()) { + // Need to re-construct the 'ResponseIterator' and update the + // 'response_iterator_map_' to make sure the 'ResponseIterator' object has + // the correct first response. 
+ auto response_iterator = std::make_shared(infer_response); + std::vector> existing_responses = + response_iterator_map_[infer_response->Id()]->GetExistingResponses(); + for (auto& response : existing_responses) { + response_iterator->EnqueueResponse(response); } - response_batch_->responses = responses_shm_offset; - response_batch_->batch_size = response_size; - size_t i = 0; - for (auto& response : responses) { - Response* response_shm = &responses_shm[i]; - try { - ProcessResponse( - response_shm, response_batch_, response, serialize_bytes_); - } - catch (const PythonBackendException& pb_exception) { - LOG_EXCEPTION(pb_exception); - SetErrorForResponse(response_shm, pb_exception.what()); - } - i += 1; + response_iterator_map_[infer_response->Id()] = response_iterator; + } else { + auto response_iterator = std::make_shared(infer_response); + response_iterator_map_.insert( + std::pair>( + response_iterator->Id(), response_iterator)); + } + + return response_iterator_map_[infer_response->Id()]; +} + +bool +Stub::IsInitialized() +{ + return initialized_; +} + +bool +Stub::IsFinalizing() +{ + return finalizing_; +} + +void +Stub::EnqueueUtilsMessage( + std::unique_ptr utils_msg_payload) +{ + { + std::lock_guard guard{stub_to_parent_message_mu_}; + stub_to_parent_buffer_.push(std::move(utils_msg_payload)); + } + stub_to_parent_message_cv_.notify_one(); +} + +cudaStream_t +Stub::GetProxyStream(const int& device_id) +{ +#ifdef TRITON_ENABLE_GPU + std::lock_guard lock(dlpack_proxy_stream_pool_mu_); + if (dlpack_proxy_stream_pool_.find(device_id) == + dlpack_proxy_stream_pool_.end()) { + cudaStream_t new_proxy_stream; + cudaError_t err = cudaStreamCreate(&new_proxy_stream); + if (err == cudaSuccess) { + dlpack_proxy_stream_pool_.emplace(device_id, new_proxy_stream); + return new_proxy_stream; + } else { + throw PythonBackendException( + "Failed to create a CUDA stream for a DLPack call."); } + } + return dlpack_proxy_stream_pool_[device_id]; +#else + return nullptr; +#endif +} + +void +Stub::GetCUDAMemoryPoolAddress(std::unique_ptr& ipc_message) +{ +#ifdef TRITON_ENABLE_GPU + bool has_exception = false; + std::string error_string; + std::unique_ptr error_string_shm; + + CUDAMemPoolMessage* cuda_pool_message_ptr = nullptr; + try { + AllocatedSharedMemory cuda_handle_shm = + shm_pool_->Load(ipc_message->Args()); + cuda_pool_message_ptr = cuda_handle_shm.data_.get(); + + CUDAHandler& cuda_api = CUDAHandler::getInstance(); + void* cuda_pool_address; + cuda_api.OpenCudaHandle( + cuda_pool_message_ptr->device_id, &cuda_pool_message_ptr->cuda_handle, + &cuda_pool_address); + shm_pool_->GetCUDAMemoryPoolManager()->SetCUDAPoolAddress( + cuda_pool_message_ptr->device_id, cuda_pool_address); + } + catch (const PythonBackendException& pb_exception) { + has_exception = true; + error_string = pb_exception.what(); + shm_pool_->GetCUDAMemoryPoolManager()->SetCUDAPoolAddress( + cuda_pool_message_ptr->device_id, nullptr); + } - return 0; + if (has_exception) { + LOG_INFO << "Failed to initialize CUDA shared memory pool in Python stub: " + << error_string; + cuda_pool_message_ptr->has_error = true; + cuda_pool_message_ptr->is_error_set = false; + + LOG_IF_EXCEPTION( + error_string_shm = PbString::Create(shm_pool_, error_string)); + if (error_string_shm != nullptr) { + cuda_pool_message_ptr->is_error_set = true; + cuda_pool_message_ptr->error = error_string_shm->ShmHandle(); + } } - void Initialize(std::string& model_version, std::string triton_install_path) { - try { - try { - py::module sys = 
py::module::import("sys"); - - std::string model_name = - model_path_.substr(model_path_.find_last_of("/") + 1); - std::string model_path_parent = - model_path_.substr(0, model_path_.find_last_of("/")); - std::string model_path_parent_parent = - model_path_parent.substr(0, model_path_parent.find_last_of("/")); - std::string python_backend_folder = triton_install_path; - sys.attr("path").attr("append")(model_path_parent); - sys.attr("path").attr("append")(model_path_parent_parent); - sys.attr("path").attr("append")(python_backend_folder); - - py::module python_backend_utils = - py::module::import("triton_python_backend_utils"); - - py::object TritonPythonModel = - py::module::import((model_version + std::string(".model")).c_str()) - .attr("TritonPythonModel"); - PyRequest_ = python_backend_utils.attr("InferenceRequest"); - PyTensor_ = python_backend_utils.attr("Tensor"); - deserialize_bytes_ = - python_backend_utils.attr("deserialize_bytes_tensor"); - serialize_bytes_ = python_backend_utils.attr("serialize_byte_tensor"); - model_instance_ = TritonPythonModel(); - - std::unordered_map map; - LoadMapFromSharedMemory(shm_pool_, ipc_message_->request_batch, map); - py::dict model_config_params; - - for (const auto& pair : map) { - model_config_params[pair.first.c_str()] = pair.second; - } - // Call initialize if exists. - if (py::hasattr(model_instance_, "initialize")) { - model_instance_.attr("initialize")(model_config_params); - } - } + bi::scoped_lock lock{ + *(ipc_message->ResponseMutex())}; + cuda_pool_message_ptr->waiting_on_stub = true; + ipc_message->ResponseCondition()->notify_all(); + while (cuda_pool_message_ptr->waiting_on_stub) { + ipc_message->ResponseCondition()->wait(lock); + } + } +#endif +} - catch (const py::error_already_set& e) { - LOG_INFO << e.what(); - SetErrorForResponseBatch(e.what()); +void +Stub::ProcessBLSResponseDecoupled(std::unique_ptr& ipc_message) +{ + ResponseBatch* response_batch = nullptr; + bi::managed_external_buffer::handle_t* response_handle = nullptr; + std::unique_ptr infer_response; + bool responses_is_set = false; + PythonBackendException pb_exception(std::string{}); - NotifyParent(); - exit(1); + try { + AllocatedSharedMemory response_batch_shm = + shm_pool_->Load(ipc_message->Args()); + response_batch = + reinterpret_cast(response_batch_shm.data_.get()); + response_handle = reinterpret_cast( + response_batch_shm.data_.get() + sizeof(ResponseBatch)); + responses_is_set = true; + + if (response_batch->has_error) { + if (response_batch->is_error_set) { + std::unique_ptr pb_string = + PbString::LoadFromSharedMemory(shm_pool_, response_batch->error); + infer_response = std::make_unique( + std::vector>{}, + std::make_shared(pb_string->String())); + } else { + infer_response = std::make_unique( + std::vector>{}, + std::make_shared( + "An error occurred while performing BLS request.")); } } - catch (const PythonBackendException& pb_exception) { - LOG_INFO << "Failed to initialize Python stub: " << pb_exception.what(); - NotifyParent(); - exit(1); + + if (responses_is_set) { + infer_response = InferResponse::LoadFromSharedMemory( + shm_pool_, *response_handle, true /* open cuda handle */); + + for (auto& output_tensor : infer_response->OutputTensors()) { + if (!output_tensor->IsCPU()) { + uint64_t memory_release_id = + output_tensor->Memory()->MemoryReleaseId(); + output_tensor->Memory()->SetMemoryReleaseCallback( + [this, memory_release_id]() { + this->MemoryManagerQueue()->Push(memory_release_id); + }); + } + } + } else { + infer_response = 
std::make_unique( + std::vector>{}, + std::make_shared( + "An error occurred while performing BLS request.")); } } + catch (const PythonBackendException& pb_exception) { + infer_response = std::make_unique( + std::vector>{}, + std::make_shared(pb_exception.what())); + } - void UpdateHealth() { - bi::scoped_lock lock(*health_mutex_); - ipc_message_->health = true; + std::lock_guard lock(response_iterator_map_mu_); + if (response_iterator_map_.find(infer_response->Id()) != + response_iterator_map_.end()) { + response_iterator_map_[infer_response->Id()]->EnqueueResponse( + std::move(infer_response)); + } else { + auto response_iterator = + std::make_shared(std::move(infer_response)); + response_iterator_map_.insert( + std::pair>( + response_iterator->Id(), response_iterator)); + } } - void Finalize() { - // Call finalize if exists. - if (py::hasattr(model_instance_, "finalize")) { - try { - model_instance_.attr("finalize")(); - } - catch (const py::error_already_set& e) { - LOG_INFO << e.what(); - } + bi::scoped_lock lock{ + *(ipc_message->ResponseMutex())}; + response_batch->waiting_on_stub = true; + ipc_message->ResponseCondition()->notify_all(); + } +} + +PYBIND11_EMBEDDED_MODULE(c_python_backend_utils, module) +{ + py::class_> triton_error( + module, "TritonError"); + py::enum_(triton_error, "__ErrorCode") + .value("UNKNOWN", TRITONSERVER_Error_Code::TRITONSERVER_ERROR_UNKNOWN) + .value("INTERNAL", TRITONSERVER_Error_Code::TRITONSERVER_ERROR_INTERNAL) + .value("NOT_FOUND", TRITONSERVER_Error_Code::TRITONSERVER_ERROR_NOT_FOUND) + .value( + "INVALID_ARG", + TRITONSERVER_Error_Code::TRITONSERVER_ERROR_INVALID_ARG) + .value( + "UNAVAILABLE", + TRITONSERVER_Error_Code::TRITONSERVER_ERROR_UNAVAILABLE) + .value( + "UNSUPPORTED", + TRITONSERVER_Error_Code::TRITONSERVER_ERROR_UNSUPPORTED) + .value( + "ALREADY_EXISTS", + TRITONSERVER_Error_Code::TRITONSERVER_ERROR_ALREADY_EXISTS) + .value("CANCELLED", TRITONSERVER_Error_Code::TRITONSERVER_ERROR_CANCELLED) + .export_values(); + triton_error.def_property_readonly_static( + "UNKNOWN", + [](py::object /* self */) { return TRITONSERVER_ERROR_UNKNOWN; }); + triton_error.def_property_readonly_static( + "INTERNAL", + [](py::object /* self */) { return TRITONSERVER_ERROR_INTERNAL; }); + triton_error.def_property_readonly_static( + "NOT_FOUND", + [](py::object /* self */) { return TRITONSERVER_ERROR_NOT_FOUND; }); + triton_error.def_property_readonly_static( + "INVALID_ARG", + [](py::object /* self */) { return TRITONSERVER_ERROR_INVALID_ARG; }); + triton_error.def_property_readonly_static( + "UNAVAILABLE", + [](py::object /* self */) { return TRITONSERVER_ERROR_UNAVAILABLE; }); + triton_error.def_property_readonly_static( + "UNSUPPORTED", + [](py::object /* self */) { return TRITONSERVER_ERROR_UNSUPPORTED; }); + triton_error.def_property_readonly_static( + "ALREADY_EXISTS", + [](py::object /* self */) { return TRITONSERVER_ERROR_ALREADY_EXISTS; }); + triton_error.def_property_readonly_static( + "CANCELLED", + [](py::object /* self */) { return TRITONSERVER_ERROR_CANCELLED; }); + triton_error.def( + py::init(), + py::arg("message").none(false), + py::arg("code").none(false) = TRITONSERVER_ERROR_INTERNAL); + triton_error.def("code", &PbError::Code); + triton_error.def("message", &PbError::Message); + + py::class_>( + module, "PreferredMemory") + .def( + py::init(), + py::arg("preferred_memory_type").none(false), + py::arg("preferred_device_id").none(false) = 0); + + py::enum_(module, "MemoryType") + .value("TRITONSERVER_MEMORY_GPU", 
PreferredMemory::MemoryType::kGPU) + .value("TRITONSERVER_MEMORY_CPU", PreferredMemory::MemoryType::kCPU) + .export_values(); + + py::class_>( + module, "InferenceTrace") + .def("get_context", [](InferenceTrace& self) -> py::object { + auto context = self.Context(); + if (context != "") { + return py::str(context); + } + return py::none(); + }); + + py::class_>( + module, "InferenceRequest") + .def( + py::init( + [](const std::string& request_id, + const py::object& correlation_id, + const std::vector>& inputs, + const std::vector& requested_output_names, + const std::string& model_name, const int64_t model_version, + const uint32_t flags, const uint64_t timeout, + const PreferredMemory& preferred_memory, + const InferenceTrace& trace, const py::object& parameters_) { + py::dict parameters = + PyDefaultArgumentToMutableType(parameters_); + std::set requested_outputs; + for (auto& requested_output_name : requested_output_names) { + requested_outputs.emplace(requested_output_name); + } + std::string parameters_str = PyParametersToJSON(parameters); + + CorrelationId correlation_id_obj; + if (py::isinstance(correlation_id)) { + correlation_id_obj = + CorrelationId(py::cast(correlation_id)); + } else if (py::isinstance(correlation_id)) { + correlation_id_obj = + CorrelationId(py::cast(correlation_id)); + } else { + throw PythonBackendException( + "Correlation ID must be integer or string"); + } + + return std::make_shared( + request_id, correlation_id_obj, inputs, requested_outputs, + model_name, model_version, parameters_str, flags, timeout, + 0 /*response_factory_address*/, 0 /*request_address*/, + preferred_memory, trace); + }), + py::arg("request_id").none(false) = "", + py::arg("correlation_id").none(false) = 0, + py::arg("inputs").none(false), + py::arg("requested_output_names").none(false), + py::arg("model_name").none(false), + py::arg("model_version").none(false) = -1, + py::arg("flags").none(false) = 0, py::arg("timeout").none(false) = 0, + py::arg("preferred_memory").none(false) = + PreferredMemory(PreferredMemory::kDefault, 0), + py::arg("trace").none(false) = InferenceTrace(), + py::arg("parameters").none(true) = py::none()) + .def( + "inputs", &InferRequest::Inputs, + py::return_value_policy::reference_internal) + .def("request_id", &InferRequest::RequestId) + .def( + "correlation_id", + [](InferRequest& self) -> py::object { + CorrelationId correlation_id = self.GetCorrelationId(); + if (correlation_id.Type() == CorrelationIdDataType::STRING) { + return py::cast(correlation_id.StringValue()); + } else { + return py::cast(correlation_id.UnsignedIntValue()); + } + }) + .def("flags", &InferRequest::Flags) + .def("set_flags", &InferRequest::SetFlags) + .def("timeout", &InferRequest::Timeout) + .def("parameters", &InferRequest::Parameters) + .def("trace", &InferRequest::GetTrace) + .def( + "exec", + [](std::shared_ptr& infer_request, + const bool decoupled) { + std::unique_ptr& stub = Stub::GetOrCreateInstance(); + std::shared_ptr response = + infer_request->Exec(decoupled); + py::object response_object; + if (decoupled) { + auto response_iterator = stub->GetResponseIterator(response); + response_object = py::cast(response_iterator); + } else { + response_object = py::cast(response); + } + + return response_object; + }, + py::arg("decoupled").none(false) = false) + .def( + "async_exec", + [](std::shared_ptr& infer_request, + const bool decoupled) { + std::unique_ptr& stub = Stub::GetOrCreateInstance(); + py::object loop = + py::module_::import("asyncio").attr("get_running_loop")(); + 
py::cpp_function callback = [&stub, infer_request, decoupled]() { + std::shared_ptr response = + infer_request->Exec(decoupled); + py::object response_object; + if (decoupled) { + auto response_iterator = stub->GetResponseIterator(response); + response_object = py::cast(response_iterator); + } else { + response_object = py::cast(response); + } + + return response_object; + }; + py::object future = + loop.attr("run_in_executor")(py::none(), callback); + return future; + }, + py::arg("decoupled").none(false) = false) + .def( + "requested_output_names", &InferRequest::RequestedOutputNames, + py::return_value_policy::reference_internal) + .def("get_response_sender", &InferRequest::GetResponseSender) + .def("is_cancelled", &InferRequest::IsCancelled) + .def("set_release_flags", &InferRequest::SetReleaseFlags), + py::arg("flags").none(false); + + py::class_>(module, "Tensor") + .def(py::init(&PbTensor::FromNumpy)) + .def("name", &PbTensor::Name) + // The reference_internal is added to make sure that the NumPy object has + // the same lifetime as the tensor object. This means even when the NumPy + // object is only in scope, the tensor object is not deallocated from + // shared memory to make sure the NumPy object is still valid. + .def( + "as_numpy", &PbTensor::AsNumpy, + py::return_value_policy::reference_internal) + .def("triton_dtype", &PbTensor::TritonDtype) + .def("to_dlpack", &PbTensor::ToDLPack) + .def("is_cpu", &PbTensor::IsCPU) + .def("shape", &PbTensor::Dims) + .def("from_dlpack", &PbTensor::FromDLPack) + .def("__dlpack__", &PbTensor::DLPack, py::arg("stream") = py::none()) + .def("__dlpack_device__", &PbTensor::DLPackDevice); + + py::class_>( + module, "InferenceResponse") + .def( + py::init( + [](const std::vector>& output_tensors, + const std::shared_ptr& error, + const py::object& parameters_) { + py::dict parameters = + PyDefaultArgumentToMutableType(parameters_); + std::string parameters_str = PyParametersToJSON(parameters); + return std::make_shared( + output_tensors, error, parameters_str /* parameters */); + }), + py::arg("output_tensors") = py::list(), + py::arg("error") = static_cast>(nullptr), + py::arg("parameters") = py::none()) + .def( + "output_tensors", &InferResponse::OutputTensors, + py::return_value_policy::reference) + .def("has_error", &InferResponse::HasError) + .def("error", &InferResponse::Error) + .def("parameters", &InferResponse::Parameters); + + py::class_>( + module, "InferenceResponseSender") + .def( + "send", &ResponseSender::Send, py::arg("response") = nullptr, + py::arg("flags") = 0) + .def("is_cancelled", &ResponseSender::IsCancelled); + + py::class_>( + module, "ResponseIterator") + .def(py::init&>()) + .def( + "__iter__", + [](ResponseIterator& it) -> ResponseIterator& { + it.Iter(); + return it; + }) + .def("__next__", &ResponseIterator::Next) + .def("cancel", &ResponseIterator::Cancel); + + py::class_ logger(module, "Logger"); + py::enum_(logger, "LogLevel") + .value("INFO", LogLevel::kInfo) + .value("WARNING", LogLevel::kWarning) + .value("ERROR", LogLevel::kError) + .value("VERBOSE", LogLevel::kVerbose) + .export_values(); + logger.def_static( + "log", py::overload_cast(&Logger::Log), + py::arg("message"), py::arg("level") = LogLevel::kInfo); + logger.def_static("log_info", &Logger::LogInfo, py::arg("message")); + logger.def_static("log_warn", &Logger::LogWarn, py::arg("message")); + logger.def_static("log_error", &Logger::LogError, py::arg("message")); + logger.def_static("log_verbose", &Logger::LogVerbose, py::arg("message")); + + 
py::class_>(module, "Metric") + .def("increment", &Metric::SendIncrementRequest) + .def("set", &Metric::SendSetValueRequest) + .def("observe", &Metric::SendObserveRequest) + .def("value", &Metric::SendGetValueRequest); + + py::enum_(module, "MetricKind") + .value("COUNTER", MetricKind::kCounter) + .value("GAUGE", MetricKind::kGauge) + .value("HISTOGRAM", MetricKind::kHistogram) + .export_values(); + + py::class_>( + module, "MetricFamily") + .def( + py::init(&MetricFamily::CreateMetricFamily), + py::arg("name").none(false), py::arg("description").none(false), + py::arg("kind").none(false)) + .def( + "Metric", &MetricFamily::CreateMetric, + py::arg("labels").none(true) = py::none(), + py::arg("buckets").none(true) = py::none()); + module.attr("MetricFamily").attr("COUNTER") = MetricKind::kCounter; + module.attr("MetricFamily").attr("GAUGE") = MetricKind::kGauge; + module.attr("MetricFamily").attr("HISTOGRAM") = MetricKind::kHistogram; + + module.def( + "load_model", &LoadModel, py::arg("model_name").none(false), + py::arg("config").none(false) = "", + py::arg("files").none(true) = py::none()); + module.def( + "unload_model", &UnloadModel, py::arg("model_name").none(false), + py::arg("unload_dependents").none(false) = false); + module.def( + "is_model_ready", &IsModelReady, py::arg("model_name").none(false), + py::arg("model_version").none(false) = ""); + + // This function is not part of the public API for Python backend. This is + // only used for internal callbacks. + module.def( + "async_event_future_done_callback", &AsyncEventFutureDoneCallback, + py::arg("py_future").none(false)); + + // This class is not part of the public API for Python backend. This is only + // used for internal testing purposes. + py::class_(module, "SharedMemory") + .def("free_memory", &SharedMemoryManager::FreeMemory); + + py::register_exception( + module, "TritonModelException"); +} + + +void +ModelContext::Init( + const std::string& model_path, const std::string& runtime_modeldir, + const std::string& triton_install_path, const std::string& model_version) +{ + const char os_slash = std::filesystem::path::preferred_separator; + type_ = ModelType::kDefault; + if (runtime_modeldir != "DEFAULT") { + // For python based backends, existence of `model.py` in the corresponding + // backend folder happens on the core side, so we can omit this check here. + python_model_path_ = runtime_modeldir + os_slash + "model.py"; + type_ = ModelType::kBackend; + } else { + python_model_path_ = model_path; + // Check if model file exists in this path. + struct stat buffer; + if (stat(python_model_path_.c_str(), &buffer) != 0) { + throw PythonBackendException( + ("Python model file not found in \'" + model_path + "\'")); } } - // Wait for notification from the server. Returns true if the parent process - // has received a SIGTERM, and false otherwise. 
- bool WaitForNotification() - { - boost::posix_time::ptime timeout; - do { - timeout = - boost::get_system_time() + boost::posix_time::milliseconds(1000); - } while (!stub_cond_->timed_wait(stub_lock_, timeout) != 0 && - !sigterm_received); - return sigterm_received; + model_dir_ = model_path.substr(0, model_path.find_last_of(os_slash)); + python_backend_folder_ = triton_install_path; + model_version_ = model_version; + runtime_modeldir_ = runtime_modeldir; +} + +void +ModelContext::StubSetup(py::module& sys) +{ + const char os_slash = std::filesystem::path::preferred_separator; + std::string model_name = + python_model_path_.substr(python_model_path_.find_last_of(os_slash) + 1); + + // Model name without the .py extension + auto dotpy_pos = model_name.find_last_of(".py"); + if (dotpy_pos == std::string::npos || dotpy_pos != model_name.size() - 1) { + throw PythonBackendException( + "Model name must end with '.py'. Model name is \"" + model_name + + "\"."); + } + // The position of last character of the string that is searched for is + // returned by 'find_last_of'. Need to manually adjust the position. + std::string model_name_trimmed = model_name.substr(0, dotpy_pos - 2); + + if (type_ == ModelType::kDefault) { + std::string model_path_parent = + python_model_path_.substr(0, python_model_path_.find_last_of(os_slash)); + std::string model_path_parent_parent = + model_path_parent.substr(0, model_path_parent.find_last_of(os_slash)); + sys.attr("path").attr("append")(model_path_parent); + sys.attr("path").attr("append")(model_path_parent_parent); + sys.attr("path").attr("append")(python_backend_folder_); + sys = py::module_::import( + (std::string(model_version_) + "." + model_name_trimmed).c_str()); + } else { + std::string model_path_parent = + python_model_path_.substr(0, python_model_path_.find_last_of(os_slash)); + std::string backend_model_dir(model_path_parent); + sys.attr("path").attr("append")(backend_model_dir); + sys.attr("path").attr("append")(python_backend_folder_); + sys = py::module_::import(model_name_trimmed.c_str()); } -}; +} + +#ifdef _WIN32 +bool +ParentProcessActive(DWORD parent_id) +{ + HANDLE parent = OpenProcess(PROCESS_ALL_ACCESS, FALSE, parent_id); + DWORD exit_code; + GetExitCodeProcess(parent, &exit_code); + return (exit_code == STILL_ACTIVE); +} +#else +bool +ParentProcessActive(pid_t parent_id) +{ + return (kill(parent_id, 0) == 0); +} +#endif extern "C" { int main(int argc, char** argv) { - if (argc < 7) { - LOG_INFO << "Expected 7 arguments, found " << argc << " arguments."; + std::unique_ptr& logger = Logger::GetOrCreateInstance(); + if (argc < 9) { + LOG_INFO << "Expected 9 arguments, found " << argc << " arguments."; + logger.reset(); exit(1); } signal(SIGINT, SignalHandler); - signal(SIGTERM, SigtermHandler); + signal(SIGTERM, SignalHandler); - // Path to model.py + // Path to model std::string model_path = argv[1]; std::string shm_region_name = argv[2]; - int64_t shm_default_size = std::stoi(argv[3]); + int64_t shm_default_size = std::stol(argv[3]); std::vector model_path_tokens; // Find the package name from model path. 
size_t prev = 0, pos = 0; + const char os_slash = std::filesystem::path::preferred_separator; do { - pos = model_path.find("/", prev); + pos = model_path.find(os_slash, prev); if (pos == std::string::npos) pos = model_path.length(); std::string token = model_path.substr(prev, pos - prev); @@ -716,80 +2014,105 @@ main(int argc, char** argv) if (model_path_tokens.size() < 2) { LOG_INFO << "Model path does not look right: " << model_path; + logger.reset(); exit(1); } std::string model_version = model_path_tokens[model_path_tokens.size() - 2]; - int64_t shm_growth_size = std::stoi(argv[4]); - pid_t parent_pid = std::stoi(argv[5]); + int64_t shm_growth_size = std::stol(argv[4]); std::string triton_install_path = argv[6]; + std::string name = argv[8]; + std::string runtime_modeldir = argv[9]; - std::unique_ptr stub; + std::unique_ptr& stub = Stub::GetOrCreateInstance(); try { - stub = std::make_unique( - shm_growth_size, shm_default_size, shm_region_name, model_path); + stub->Instantiate( + shm_growth_size, shm_default_size, shm_region_name, model_path, + model_version, argv[6] /* triton install path */, + std::stoi(argv[7]) /* IPCControl handle */, name, runtime_modeldir); } catch (const PythonBackendException& pb_exception) { LOG_INFO << "Failed to preinitialize Python stub: " << pb_exception.what(); - exit(1); - } - - // Exit if it has received a SIGTERM signal. - if (stub->WaitForNotification()) { - LOG_INFO << "Received SIGTERM: exiting."; + logger.reset(); + stub.reset(); exit(1); } // Start the Python Interpreter py::scoped_interpreter guard{}; - - stub->Initialize(model_version, argv[6] /* triton install path */); - std::atomic non_graceful_exit = {false}; - +#ifdef _WIN32 + DWORD parent_pid = (DWORD)std::stoul(argv[5]); +#else + pid_t parent_pid = std::stoi(argv[5]); +#endif std::atomic background_thread_running = {true}; - std::thread background_thread( - [&parent_pid, &background_thread_running, &stub, &non_graceful_exit] { + std::thread background_thread = + std::thread([&parent_pid, &background_thread_running, &stub, &logger] { + // Send a dummy message after the stub process is launched to notify the + // parent process that the health thread has started. + std::unique_ptr ipc_message = IPCMessage::Create( + stub->SharedMemory(), false /* inline_response */); + stub->SendIPCMessage(ipc_message); + while (background_thread_running) { // Every 300ms set the health variable to true. This variable is in // shared memory and will be set to false by the parent process. - // The parent process expects that the stub process sets this variable - // to true within 1 second. - sleep(0.3); + // The parent process expects that the stub process sets this + // variable to true within 1 second. + std::this_thread::sleep_for(std::chrono::milliseconds(300)); stub->UpdateHealth(); - if (sigterm_received) { - background_thread_running = false; - } - if (kill(parent_pid, 0) != 0) { + if (!ParentProcessActive(parent_pid)) { + // When unhealthy, we should stop attempting to send + // messages to the backend ASAP. + if (stub->StubToParentServiceActive()) { + stub->TerminateStubToParentQueueMonitor(); + } + if (stub->ParentToStubServiceActive()) { + stub->TerminateParentToStubQueueMonitor(); + } // Destroy Stub - stub.reset(); LOG_INFO << "Non-graceful termination detected. "; background_thread_running = false; non_graceful_exit = true; - sigterm_received = true; + + // Destroy stub and exit. 
+ logger.reset(); + stub.reset(); + exit(1); } } }); - // Wait for messages from the parent process + // The stub process will always keep listening for new notifications from the + // parent process. After the notification is received the stub process will + // run the appropriate command and wait for new notifications. + bool finalize = false; while (true) { - stub->NotifyParent(); - if (stub->WaitForNotification()) { + if (finalize) { + stub->Finalize(); + // Need check or may receive not joinable error + if (stub->StubToParentServiceActive()) { + stub->TerminateStubToParentQueueMonitor(); + } + if (stub->ParentToStubServiceActive()) { + stub->TerminateParentToStubQueueMonitor(); + } + background_thread_running = false; + background_thread.join(); break; } - - int stop = stub->Execute(); - if (stop) - break; + finalize = stub->RunCommand(); } - if (!non_graceful_exit) { - stub->Finalize(); - stub->NotifyParent(); - } + // Stub must be destroyed before the py::scoped_interpreter goes out of + // scope. The reason is that stub object has some attributes that are Python + // objects. If the scoped_interpreter is destroyed before the stub object, + // this process will no longer hold the GIL lock and destruction of the stub + // will result in segfault. + logger.reset(); + stub.reset(); - background_thread_running = false; - background_thread.join(); return 0; } } diff --git a/src/pb_stub.h b/src/pb_stub.h new file mode 100644 index 00000000..942ecd98 --- /dev/null +++ b/src/pb_stub.h @@ -0,0 +1,374 @@ +// Copyright 2021-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions +// are met: +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// * Neither the name of NVIDIA CORPORATION nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +#pragma once + +#include +#include +#include + +#include "infer_request.h" +#include "infer_response.h" +#include "ipc_message.h" +#include "message_queue.h" +#include "metric.h" +#include "metric_family.h" +#include "pb_cancel.h" +#include "pb_log.h" +#include "pb_response_iterator.h" + + +namespace bi = boost::interprocess; +namespace py = pybind11; +using namespace pybind11::literals; + +#ifndef TRITON_ENABLE_GPU +using cudaStream_t = void*; +#endif + +namespace triton { namespace backend { namespace python { + +class ModelContext { + public: + // Scans and establishes path for serving the python model. + void Init( + const std::string& model_path, const std::string& platform, + const std::string& triton_install_path, const std::string& model_version); + // Sets up the python stub with appropriate paths. + void StubSetup(py::module& sys); + + std::string& PythonModelPath() { return python_model_path_; } + std::string& ModelDir() { return model_dir_; } + + private: + std::string python_model_path_; + std::string model_dir_; + std::string model_version_; + std::string python_backend_folder_; + std::string runtime_modeldir_; + + // Triton supports python-based backends, + // i.e. backends that provide common `model.py`, that can be re-used + // between different models. `ModelType` helps to differentiate + // between models running with c++ python backend (ModelType::kDefault) + // and models running with python-based backend (ModelType::kBackend) + // at the time of ModelContext::StubSetup to properly set up paths. + enum ModelType { kDefault, kBackend }; + ModelType type_; +}; + +// The payload for the stub_to_parent message queue. This struct serves as a +// wrapper for different types of messages so that they can be sent through the +// same buffer. +struct UtilsMessagePayload { + UtilsMessagePayload( + const PYTHONSTUB_CommandType& command_type, void* utils_message_ptr) + : command_type(command_type), utils_message_ptr(utils_message_ptr) + { + } + PYTHONSTUB_CommandType command_type; + void* utils_message_ptr; +}; + +class Stub { + public: + Stub() : stub_to_parent_thread_(false), parent_to_stub_thread_(false){}; + static std::unique_ptr& GetOrCreateInstance(); + + /// Instantiate a new Python backend Stub. + void Instantiate( + int64_t shm_growth_size, int64_t shm_default_size, + const std::string& shm_region_name, const std::string& model_path, + const std::string& model_version, const std::string& triton_install_path, + bi::managed_external_buffer::handle_t ipc_control_handle, + const std::string& model_instance_name, + const std::string& runtime_modeldir); + + /// Get the health of the stub process. + bool& Health(); + + /// Get the shared memory manager. + std::unique_ptr& SharedMemory(); + + /// Run a single command from the shared memory. + bool RunCommand(); + + /// Setup for the stub process + py::module StubSetup(); + + /// Return the path to the model + py::str GetModelDir() { return model_context_.ModelDir(); } + + /// Set the model configuration for auto-complete + void AutoCompleteModelConfig( + bi::managed_external_buffer::handle_t string_handle, + std::string* auto_complete_config); + + /// Initialize the user's Python code. + void Initialize(bi::managed_external_buffer::handle_t map_handle); + + /// Send a message to the parent process. + void SendIPCMessage(std::unique_ptr& ipc_message); + + /// Send a utils message to the parent process. + void SendIPCUtilsMessage(std::unique_ptr& ipc_message); + + /// Receive a message from the parent process. 
+ std::unique_ptr PopMessage(); + + /// Update the health variable in the stub process. + void UpdateHealth(); + + /// Finalize and terminate the stub process + void Finalize(); + + /// Load all the requests from shared memory + py::list LoadRequestsFromSharedMemory(RequestBatch* request_batch_shm_ptr); + + /// Execute a batch of requests. + void ProcessRequests(RequestBatch* request_batch_shm_ptr); + + void ProcessReturnedResponses( + py::list py_requests, py::object py_responses_obj, + std::optional>& response_batch); + + void ProcessResponse(InferResponse* response); + + py::object GetAsyncEventLoop(); + + py::object RunCoroutine(py::object coroutine, bool in_background); + + void BackgroundFutureDone(const py::object& py_future); + + /// Get the memory manager message queue + std::unique_ptr>& MemoryManagerQueue(); + + /// Get the shared memory pool + std::unique_ptr& ShmPool() { return shm_pool_; } + + void ProcessBLSResponseDecoupled(std::unique_ptr& ipc_message); + + void LoadGPUBuffers(std::unique_ptr& ipc_message); + + bool IsDecoupled(); + ~Stub(); + + /// Start stub to parent message handler process + void LaunchStubToParentQueueMonitor(); + + /// End stub to parent message handler process + void TerminateStubToParentQueueMonitor(); + + /// Add client log to queue + void EnqueueLogRequest(std::unique_ptr& log_ptr); + + /// Thread process + void ServiceStubToParentRequests(); + + /// Send client log to the python backend + void SendLogMessage(std::unique_ptr& utils_msg_payload); + + /// Check if stub to parent message handler is running + bool StubToParentServiceActive(); + + /// Start parent to stub message handler process + void LaunchParentToStubQueueMonitor(); + + /// End parent to stub message handler process + void TerminateParentToStubQueueMonitor(); + + /// Check if parent to stub message handler is running + bool ParentToStubServiceActive(); + + /// Thread process + void ParentToStubMQMonitor(); + + /// Get the ResponseIterator object associated with the infer response + std::shared_ptr GetResponseIterator( + std::shared_ptr infer_response); + + /// Send the id to the python backend for object cleanup + void SendCleanupId( + std::unique_ptr& utils_msg_payload, + const PYTHONSTUB_CommandType& command_type); + + /// Add cleanup id to queue. This is used for cleaning up the infer_payload + /// and the response factory for BLS decoupled response. + void EnqueueCleanupId(void* id, const PYTHONSTUB_CommandType& command_type); + + /// Send the id to the python backend for request address retrieval and + /// cancellation + void SendCancelBLSRequest( + std::unique_ptr& utils_msg_payload); + + /// Add infer payload id to queue. This is used for retrieving the request + /// address from the infer_payload + void EnqueueCancelBLSRequest(PbBLSCancel* pb_bls_cancel); + + /// Add request cancellation query to queue + void EnqueueIsCancelled(PbCancel* pb_cancel); + + /// Send request cancellation query to python backend + void SendIsCancelled(std::unique_ptr& utils_msg_payload); + + /// Is the stub initialized + bool IsInitialized(); + + /// Is the stub in the finalize stage + bool IsFinalizing(); + + /// Helper function to enqueue a utils message to the stub to parent message + /// buffer + void EnqueueUtilsMessage( + std::unique_ptr utils_msg_payload); + + /// Send the message to the python backend. MessageType should be either + // 'MetricFamilyMessage', 'MetricMessage' or 'ModelLoaderMessage'. 
+ template + void SendMessage( + AllocatedSharedMemory& msg_shm, + PYTHONSTUB_CommandType command_type, + bi::managed_external_buffer::handle_t handle); + + /// Helper function to prepare the message. MessageType should be either + // 'MetricFamilyMessage', 'MetricMessage' or 'ModelLoaderMessage'. + template + void PrepareMessage(AllocatedSharedMemory& msg_shm); + + /// Helper function to retrieve a proxy stream for dlpack synchronization + /// for provided device + cudaStream_t GetProxyStream(const int& device_id); + + /// Get the CUDA memory pool address from the parent process. + void GetCUDAMemoryPoolAddress(std::unique_ptr& ipc_message); + + private: + bi::interprocess_mutex* stub_mutex_; + bi::interprocess_condition* stub_cond_; + bi::interprocess_mutex* parent_mutex_; + bi::interprocess_condition* parent_cond_; + bi::interprocess_mutex* health_mutex_; + ModelContext model_context_; + std::string name_; + IPCControlShm* ipc_control_; + std::unique_ptr shm_pool_; + py::object model_instance_; + py::object deserialize_bytes_; + py::object serialize_bytes_; + py::object async_event_loop_; + py::object background_futures_; + std::unique_ptr> + stub_message_queue_; + std::unique_ptr> + parent_message_queue_; + std::unique_ptr> + stub_to_parent_mq_; + std::unique_ptr> + parent_to_stub_mq_; + std::unique_ptr> memory_manager_message_queue_; + bool initialized_; + bool finalizing_; + static std::unique_ptr stub_instance_; + std::vector> gpu_tensors_; + std::queue> stub_to_parent_buffer_; + std::thread stub_to_parent_queue_monitor_; + bool stub_to_parent_thread_; + std::mutex stub_to_parent_message_mu_; + std::condition_variable stub_to_parent_message_cv_; + std::thread parent_to_stub_queue_monitor_; + bool parent_to_stub_thread_; + std::mutex response_iterator_map_mu_; + std::unordered_map> + response_iterator_map_; + std::mutex dlpack_proxy_stream_pool_mu_; + std::unordered_map dlpack_proxy_stream_pool_; +}; + +template +void +Stub::PrepareMessage(AllocatedSharedMemory& msg_shm) +{ + msg_shm = shm_pool_->Construct(); + MessageType* msg = msg_shm.data_.get(); + new (&(msg->mu)) bi::interprocess_mutex; + new (&(msg->cv)) bi::interprocess_condition; + msg->waiting_on_stub = false; + msg->is_error_set = false; + msg->has_error = false; +} + +template +void +Stub::SendMessage( + AllocatedSharedMemory& msg_shm, + PYTHONSTUB_CommandType command_type, + bi::managed_external_buffer::handle_t handle) +{ + PrepareMessage(msg_shm); + MessageType* msg = msg_shm.data_.get(); + msg->message = handle; + + std::unique_ptr ipc_message = + IPCMessage::Create(shm_pool_, false /* inline_response */); + ipc_message->Command() = command_type; + ipc_message->Args() = msg_shm.handle_; + + std::unique_lock guard{stub_to_parent_message_mu_}; + { + ScopedDefer _([&ipc_message, msg] { + { + bi::scoped_lock guard{msg->mu}; + msg->waiting_on_stub = false; + msg->cv.notify_all(); + } + }); + + { + bi::scoped_lock guard{msg->mu}; + SendIPCUtilsMessage(ipc_message); + while (!msg->waiting_on_stub) { + msg->cv.wait(guard); + } + } + } + if (msg->has_error) { + if (msg->is_error_set) { + std::unique_ptr pb_string = + PbString::LoadFromSharedMemory(shm_pool_, msg->error); + std::string err_message = + std::string( + "Failed to process the request for model '" + name_ + + "', message: ") + + pb_string->String(); + throw PythonBackendException(err_message); + } else { + std::string err_message = std::string( + "Failed to process the request for model '" + name_ + "'."); + throw PythonBackendException(err_message); + } + } +} +}}} 
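+// Usage sketch for PrepareMessage()/SendMessage() (illustrative; the payload
+// struct is one of the message types named in the comments above, and the
+// command and handle names below are placeholders for the matching
+// PYTHONSTUB_CommandType entry and shared-memory handle):
+//
+//   AllocatedSharedMemory<MetricMessage> msg_shm;
+//   SendMessage<MetricMessage>(msg_shm, metric_command, metric_handle);
+//
+// PrepareMessage() constructs the struct in shared memory and initializes its
+// interprocess mutex/condition variable; SendMessage() then sends an IPC
+// message to the parent and blocks until the parent flips waiting_on_stub,
+// surfacing any reported error as a PythonBackendException.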
// namespace triton::backend::python diff --git a/src/pb_stub_log.cc b/src/pb_stub_log.cc new file mode 100644 index 00000000..d0b1ff97 --- /dev/null +++ b/src/pb_stub_log.cc @@ -0,0 +1,170 @@ +// Copyright 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions +// are met: +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// * Neither the name of NVIDIA CORPORATION nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +#include "pb_stub_log.h" + +#include + +#include "pb_stub.h" + + +namespace py = pybind11; + +namespace triton { namespace backend { namespace python { + +std::unique_ptr Logger::log_instance_; + +std::unique_ptr& +Logger::GetOrCreateInstance() +{ + if (Logger::log_instance_.get() == nullptr) { + Logger::log_instance_ = std::make_unique(); + } + + return Logger::log_instance_; +} + +// Bound function, called from the python client +void +Logger::Log(const std::string& message, LogLevel level) +{ + std::unique_ptr& stub = Stub::GetOrCreateInstance(); + py::object frame = py::module_::import("inspect").attr("currentframe"); + py::object caller_frame = frame(); + py::object info = py::module_::import("inspect").attr("getframeinfo"); + py::object caller_info = info(caller_frame); + py::object filename_python = caller_info.attr("filename"); + std::string filename = filename_python.cast(); + py::object lineno = caller_info.attr("lineno"); + uint32_t line = lineno.cast(); + + if (!stub->StubToParentServiceActive()) { + Logger::GetOrCreateInstance()->Log(filename, line, level, message); + } else { + std::unique_ptr log_msg(new PbLog(filename, line, message, level)); + stub->EnqueueLogRequest(log_msg); + } +} + +// Called internally (.e.g. 
LOG_ERROR << "Error"; ) +void +Logger::Log( + const std::string& filename, uint32_t lineno, LogLevel level, + const std::string& message) +{ + // If the log monitor service is not active yet, format + // and pass messages to cerr + if (!BackendLoggingActive()) { + std::string path(filename); + size_t pos = path.rfind(std::filesystem::path::preferred_separator); + if (pos != std::string::npos) { + path = path.substr(pos + 1, std::string::npos); + } +#ifdef _WIN32 + std::stringstream ss; + SYSTEMTIME system_time; + GetSystemTime(&system_time); + ss << LeadingLogChar(level) << std::setfill('0') << std::setw(2) + << system_time.wMonth << std::setw(2) << system_time.wDay << ' ' + << std::setw(2) << system_time.wHour << ':' << std::setw(2) + << system_time.wMinute << ':' << std::setw(2) << system_time.wSecond + << '.' << std::setw(6) << system_time.wMilliseconds * 1000 << ' ' + << static_cast(GetCurrentProcessId()) << ' ' << path << ':' + << lineno << "] "; +#else + std::stringstream ss; + struct timeval tv; + gettimeofday(&tv, NULL); + struct tm tm_time; + gmtime_r(((time_t*)&(tv.tv_sec)), &tm_time); + ss << LeadingLogChar(level) << std::setfill('0') << std::setw(2) + << (tm_time.tm_mon + 1) << std::setw(2) << tm_time.tm_mday << " " + << std::setw(2) << tm_time.tm_hour << ':' << std::setw(2) + << tm_time.tm_min << ':' << std::setw(2) << tm_time.tm_sec << "." + << std::setw(6) << tv.tv_usec << ' ' << static_cast(getpid()) + << ' ' << path << ':' << lineno << "] "; + std::cerr << ss.str() << " " << message << std::endl; +#endif + } else { + // Ensure we do not create a stub instance before it has initialized + std::unique_ptr& stub = Stub::GetOrCreateInstance(); + std::unique_ptr log_msg(new PbLog(filename, lineno, message, level)); + stub->EnqueueLogRequest(log_msg); + } +} + +void +Logger::LogInfo(const std::string& message) +{ + Logger::Log(message, LogLevel::kInfo); +} + +void +Logger::LogWarn(const std::string& message) +{ + Logger::Log(message, LogLevel::kWarning); +} + +void +Logger::LogError(const std::string& message) +{ + Logger::Log(message, LogLevel::kError); +} + +void +Logger::LogVerbose(const std::string& message) +{ + Logger::Log(message, LogLevel::kVerbose); +} + +const std::string +Logger::LeadingLogChar(const LogLevel& level) +{ + switch (level) { + case LogLevel::kWarning: + return "W"; + case LogLevel::kError: + return "E"; + case LogLevel::kInfo: + case LogLevel::kVerbose: + default: + return "I"; + } +} + +void +Logger::SetBackendLoggingActive(bool status) +{ + backend_logging_active_ = status; +} + +bool +Logger::BackendLoggingActive() +{ + return backend_logging_active_; +} + +}}} // namespace triton::backend::python diff --git a/src/pb_stub_log.h b/src/pb_stub_log.h new file mode 100644 index 00000000..df67eba8 --- /dev/null +++ b/src/pb_stub_log.h @@ -0,0 +1,134 @@ +// Copyright 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions +// are met: +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. 
+// * Neither the name of NVIDIA CORPORATION nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +#pragma once + +#include +#include + +#include "pb_utils.h" + +namespace triton { namespace backend { namespace python { + +#define LOG_IF_EXCEPTION(X) \ + do { \ + try { \ + (X); \ + } \ + catch (const PythonBackendException& pb_exception) { \ + LOG_INFO << pb_exception.what(); \ + } \ + } while (false) + +#define LOG_EXCEPTION(E) \ + do { \ + LOG_INFO << E.what(); \ + } while (false) + +/// Macros that use current filename and line number. +#define LOG_INFO LOG_FL(__FILE__, __LINE__, LogLevel::kInfo) +#define LOG_WARN LOG_FL(__FILE__, __LINE__, LogLevel::kWarning) +#define LOG_ERROR LOG_FL(__FILE__, __LINE__, LogLevel::kError) +#define LOG_VERBOSE LOG_FL(__FILE__, __LINE__, LogLevel::kVerbose) + +class Logger { + public: + Logger() { backend_logging_active_ = false; }; + ~Logger() { log_instance_.reset(); }; + /// Python client log function + static void Log(const std::string& message, LogLevel level = LogLevel::kInfo); + + /// Python client log info function + static void LogInfo(const std::string& message); + + /// Python client warning function + static void LogWarn(const std::string& message); + + /// Python client log error function + static void LogError(const std::string& message); + + /// Python client log verbose function + static void LogVerbose(const std::string& message); + + /// Internal log function + void Log( + const std::string& filename, uint32_t lineno, LogLevel level, + const std::string& message); + + /// Log format helper function + const std::string LeadingLogChar(const LogLevel& level); + + /// Set PYBE Logging Status + void SetBackendLoggingActive(bool status); + + /// Get PYBE Logging Status + bool BackendLoggingActive(); + + /// Singleton Getter Function + static std::unique_ptr& GetOrCreateInstance(); + + DISALLOW_COPY_AND_ASSIGN(Logger); + + /// Flush the log. 
+ void Flush() { std::cerr << std::flush; } + + private: + static std::unique_ptr log_instance_; + bool backend_logging_active_; +}; + +class LogMessage { + public: + /// Create a log message, stripping the path down to the filename only + LogMessage(const char* file, int line, LogLevel level) : level_(level) + { + std::string path(file); + const char os_slash = std::filesystem::path::preferred_separator; + size_t pos = path.rfind(os_slash); + if (pos != std::string::npos) { + path = path.substr(pos + 1, std::string::npos); + } + file_ = path; + line_ = static_cast(line); + } + /// Log message to console or send to backend (see Logger::Log for details) + ~LogMessage() + { + Logger::GetOrCreateInstance()->Log(file_, line_, level_, stream_.str()); + } + + std::stringstream& stream() { return stream_; } + + private: + std::stringstream stream_; + std::string file_; + uint32_t line_; + LogLevel level_; +}; + +#define LOG_FL(FN, LN, LVL) LogMessage((char*)(FN), LN, LVL).stream() + +}}} // namespace triton::backend::python diff --git a/src/pb_stub_utils.cc b/src/pb_stub_utils.cc new file mode 100644 index 00000000..9e05feae --- /dev/null +++ b/src/pb_stub_utils.cc @@ -0,0 +1,321 @@ +// Copyright 2021-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions +// are met: +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// * Neither the name of NVIDIA CORPORATION nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
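+// Usage sketch for the LOG_* macros declared in pb_stub_log.h above (the
+// variable names are illustrative): each macro builds a temporary LogMessage
+// whose destructor hands the finished line to the Logger singleton, so the
+// message is emitted when the statement ends:
+//
+//   LOG_INFO << "Loaded model version " << model_version;
+//   LOG_ERROR << "Failed to map shared memory region: " << region_name;
+//
+// While backend logging is inactive the line is formatted and written to
+// stderr; once active it is enqueued and forwarded to the parent process.
+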
+ +#include "pb_stub_utils.h" + +#include "pb_utils.h" + +namespace triton { namespace backend { namespace python { + +TRITONSERVER_DataType +numpy_to_triton_type(py::object data_type) +{ + py::module np = py::module::import("numpy"); + if (data_type.equal(np.attr("bool_"))) + return TRITONSERVER_TYPE_BOOL; + else if (data_type.equal(np.attr("uint8"))) + return TRITONSERVER_TYPE_UINT8; + else if (data_type.equal(np.attr("uint16"))) + return TRITONSERVER_TYPE_UINT16; + else if (data_type.equal(np.attr("uint32"))) + return TRITONSERVER_TYPE_UINT32; + else if (data_type.equal(np.attr("uint64"))) + return TRITONSERVER_TYPE_UINT64; + else if (data_type.equal(np.attr("int8"))) + return TRITONSERVER_TYPE_INT8; + else if (data_type.equal(np.attr("int16"))) + return TRITONSERVER_TYPE_INT16; + else if (data_type.equal(np.attr("int32"))) + return TRITONSERVER_TYPE_INT32; + else if (data_type.equal(np.attr("int64"))) + return TRITONSERVER_TYPE_INT64; + else if (data_type.equal(np.attr("float16"))) + return TRITONSERVER_TYPE_FP16; + else if (data_type.equal(np.attr("float32"))) + return TRITONSERVER_TYPE_FP32; + else if (data_type.equal(np.attr("float64"))) + return TRITONSERVER_TYPE_FP64; + else if ( + data_type.equal(np.attr("object_")) || + data_type.equal(np.attr("bytes_")) || + data_type.attr("type").equal(np.attr("bytes_"))) + return TRITONSERVER_TYPE_BYTES; + throw PythonBackendException("NumPy dtype is not supported."); +} + +py::object +triton_to_numpy_type(TRITONSERVER_DataType data_type) +{ + py::module np = py::module::import("numpy"); + py::object np_type; + switch (data_type) { + case TRITONSERVER_TYPE_BOOL: + np_type = np.attr("bool_"); + break; + case TRITONSERVER_TYPE_UINT8: + np_type = np.attr("uint8"); + break; + case TRITONSERVER_TYPE_UINT16: + np_type = np.attr("uint16"); + break; + case TRITONSERVER_TYPE_UINT32: + np_type = np.attr("uint32"); + break; + case TRITONSERVER_TYPE_UINT64: + np_type = np.attr("uint64"); + break; + case TRITONSERVER_TYPE_INT8: + np_type = np.attr("int8"); + break; + case TRITONSERVER_TYPE_INT16: + np_type = np.attr("int16"); + break; + case TRITONSERVER_TYPE_INT32: + np_type = np.attr("int32"); + break; + case TRITONSERVER_TYPE_INT64: + np_type = np.attr("int64"); + break; + case TRITONSERVER_TYPE_FP16: + np_type = np.attr("float16"); + break; + case TRITONSERVER_TYPE_FP32: + np_type = np.attr("float32"); + break; + case TRITONSERVER_TYPE_FP64: + np_type = np.attr("float64"); + break; + case TRITONSERVER_TYPE_BYTES: + np_type = np.attr("object_"); + break; + default: + throw PythonBackendException( + "Unsupported triton dtype" + + std::to_string(static_cast(data_type))); + } + + return np_type; +} + +py::dtype +triton_to_pybind_dtype(TRITONSERVER_DataType data_type) +{ + py::dtype dtype_numpy; + + switch (data_type) { + case TRITONSERVER_TYPE_BOOL: + dtype_numpy = py::dtype(py::format_descriptor::format()); + break; + case TRITONSERVER_TYPE_UINT8: + dtype_numpy = py::dtype(py::format_descriptor::format()); + break; + case TRITONSERVER_TYPE_UINT16: + dtype_numpy = py::dtype(py::format_descriptor::format()); + break; + case TRITONSERVER_TYPE_UINT32: + dtype_numpy = py::dtype(py::format_descriptor::format()); + break; + case TRITONSERVER_TYPE_UINT64: + dtype_numpy = py::dtype(py::format_descriptor::format()); + break; + case TRITONSERVER_TYPE_INT8: + dtype_numpy = py::dtype(py::format_descriptor::format()); + break; + case TRITONSERVER_TYPE_INT16: + dtype_numpy = py::dtype(py::format_descriptor::format()); + break; + case TRITONSERVER_TYPE_INT32: + 
dtype_numpy = py::dtype(py::format_descriptor::format()); + break; + case TRITONSERVER_TYPE_INT64: + dtype_numpy = py::dtype(py::format_descriptor::format()); + break; + case TRITONSERVER_TYPE_FP16: + // Will be reinterpreted in the python code. + dtype_numpy = py::dtype(py::format_descriptor::format()); + break; + case TRITONSERVER_TYPE_FP32: + dtype_numpy = py::dtype(py::format_descriptor::format()); + break; + case TRITONSERVER_TYPE_FP64: + dtype_numpy = py::dtype(py::format_descriptor::format()); + break; + case TRITONSERVER_TYPE_BYTES: + // Will be reinterpreted in the python code. + dtype_numpy = py::dtype(py::format_descriptor::format()); + break; + case TRITONSERVER_TYPE_BF16: + // NOTE: Currently skipping this call via `if (BF16)` check, but may + // want to better handle this or set some default/invalid dtype. + throw PythonBackendException("TYPE_BF16 not currently supported."); + case TRITONSERVER_TYPE_INVALID: + throw PythonBackendException("Dtype is invalid."); + default: + throw PythonBackendException("Unsupported triton dtype."); + } + + return dtype_numpy; +} + +DLDataType +triton_to_dlpack_type(TRITONSERVER_DataType triton_dtype) +{ + DLDataType dl_dtype; + DLDataTypeCode dl_code; + + // Number of bits required for the data type. + size_t dt_size = 0; + + dl_dtype.lanes = 1; + switch (triton_dtype) { + case TRITONSERVER_TYPE_BOOL: + dl_code = DLDataTypeCode::kDLBool; + dt_size = 8; + break; + case TRITONSERVER_TYPE_UINT8: + dl_code = DLDataTypeCode::kDLUInt; + dt_size = 8; + break; + case TRITONSERVER_TYPE_UINT16: + dl_code = DLDataTypeCode::kDLUInt; + dt_size = 16; + break; + case TRITONSERVER_TYPE_UINT32: + dl_code = DLDataTypeCode::kDLUInt; + dt_size = 32; + break; + case TRITONSERVER_TYPE_UINT64: + dl_code = DLDataTypeCode::kDLUInt; + dt_size = 64; + break; + case TRITONSERVER_TYPE_INT8: + dl_code = DLDataTypeCode::kDLInt; + dt_size = 8; + break; + case TRITONSERVER_TYPE_INT16: + dl_code = DLDataTypeCode::kDLInt; + dt_size = 16; + break; + case TRITONSERVER_TYPE_INT32: + dl_code = DLDataTypeCode::kDLInt; + dt_size = 32; + break; + case TRITONSERVER_TYPE_INT64: + dl_code = DLDataTypeCode::kDLInt; + dt_size = 64; + break; + case TRITONSERVER_TYPE_FP16: + dl_code = DLDataTypeCode::kDLFloat; + dt_size = 16; + break; + case TRITONSERVER_TYPE_FP32: + dl_code = DLDataTypeCode::kDLFloat; + dt_size = 32; + break; + case TRITONSERVER_TYPE_FP64: + dl_code = DLDataTypeCode::kDLFloat; + dt_size = 64; + break; + case TRITONSERVER_TYPE_BYTES: + throw PythonBackendException( + "TYPE_BYTES tensors cannot be converted to DLPack."); + case TRITONSERVER_TYPE_BF16: + dl_code = DLDataTypeCode::kDLBfloat; + dt_size = 16; + break; + + default: + throw PythonBackendException( + std::string("DType code \"") + + std::to_string(static_cast(triton_dtype)) + + "\" is not supported."); + } + + dl_dtype.code = dl_code; + dl_dtype.bits = dt_size; + return dl_dtype; +} + +TRITONSERVER_DataType +dlpack_to_triton_type(const DLDataType& data_type) +{ + if (data_type.lanes != 1) { + // lanes != 1 is not supported in Python backend. 
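+    // (DLPack's `lanes` field describes vectorized element types; Triton
+    // datatypes are scalar per element, so only lanes == 1 can be mapped.)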
+ return TRITONSERVER_TYPE_INVALID; + } + + if (data_type.code == DLDataTypeCode::kDLFloat) { + if (data_type.bits == 16) { + return TRITONSERVER_TYPE_FP16; + } else if (data_type.bits == 32) { + return TRITONSERVER_TYPE_FP32; + } else if (data_type.bits == 64) { + return TRITONSERVER_TYPE_FP64; + } + } + + if (data_type.code == DLDataTypeCode::kDLInt) { + if (data_type.bits == 8) { + return TRITONSERVER_TYPE_INT8; + } else if (data_type.bits == 16) { + return TRITONSERVER_TYPE_INT16; + } else if (data_type.bits == 32) { + return TRITONSERVER_TYPE_INT32; + } else if (data_type.bits == 64) { + return TRITONSERVER_TYPE_INT64; + } + } + + if (data_type.code == DLDataTypeCode::kDLUInt) { + if (data_type.bits == 8) { + return TRITONSERVER_TYPE_UINT8; + } else if (data_type.bits == 16) { + return TRITONSERVER_TYPE_UINT16; + } else if (data_type.bits == 32) { + return TRITONSERVER_TYPE_UINT32; + } else if (data_type.bits == 64) { + return TRITONSERVER_TYPE_UINT64; + } + } + + if (data_type.code == DLDataTypeCode::kDLBool) { + if (data_type.bits == 8) { + return TRITONSERVER_TYPE_BOOL; + } + } + + if (data_type.code == DLDataTypeCode::kDLBfloat) { + if (data_type.bits != 16) { + throw PythonBackendException( + "Expected BF16 tensor to have 16 bits, but had: " + + std::to_string(data_type.bits)); + } + return TRITONSERVER_TYPE_BF16; + } + + return TRITONSERVER_TYPE_INVALID; +} +}}} // namespace triton::backend::python diff --git a/src/pb_stub_utils.h b/src/pb_stub_utils.h new file mode 100644 index 00000000..6068fba9 --- /dev/null +++ b/src/pb_stub_utils.h @@ -0,0 +1,61 @@ +// Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions +// are met: +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// * Neither the name of NVIDIA CORPORATION nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +#include +#include +#include +#include + +#include "triton/core/tritonserver.h" + +namespace py = pybind11; +namespace triton { namespace backend { namespace python { + +/// Convert numpy dtype to triton dtype +/// \param data_type numpy data type to be converted. 
+/// \return equivalent triton dtype +TRITONSERVER_DataType numpy_to_triton_type(py::object data_type); + +/// Convert triton dtype to numpy dtype +/// \param data_type triton dtype to be converted. +/// \return equivalent numpy data type. +py::object triton_to_numpy_type(TRITONSERVER_DataType data_type); + +/// Convert triton dtype to dlpack dtype +/// \param data_type triton dtype to be converted +/// \return equivalent DLPack data type. +DLDataType triton_to_dlpack_type(TRITONSERVER_DataType data_type); + +/// Convert dlpack type to triton type +/// \param data_type triton dtype to be converted +/// \return equivalent Triton dtype +TRITONSERVER_DataType dlpack_to_triton_type(const DLDataType& data_type); + +/// Convert triton data to pybind data type. +/// \param data_type triton dtype to be converted. +/// \return equivalent pybind numpy dtype. +py::dtype triton_to_pybind_dtype(TRITONSERVER_DataType data_type); +}}} // namespace triton::backend::python diff --git a/src/pb_tensor.cc b/src/pb_tensor.cc new file mode 100644 index 00000000..26e77586 --- /dev/null +++ b/src/pb_tensor.cc @@ -0,0 +1,737 @@ +// Copyright 2021-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions +// are met: +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// * Neither the name of NVIDIA CORPORATION nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
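+// Usage sketch for the conversion helpers declared in pb_stub_utils.h above
+// (illustrative; the round-trip holds for dtypes representable in DLPack):
+//
+//   TRITONSERVER_DataType t = numpy_to_triton_type(array.attr("dtype"));
+//   py::dtype view_dtype = triton_to_pybind_dtype(t);  // buffer-protocol view
+//   DLDataType dl = triton_to_dlpack_type(t);          // for DLPack export
+//   assert(dlpack_to_triton_type(dl) == t);
+//
+// TYPE_BYTES and BF16 are the special cases: BYTES has no DLPack encoding and
+// BF16 has no native NumPy dtype, so the corresponding helpers throw or defer
+// to DLPack as noted in their implementations.
+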
+ +#ifdef TRITON_ENABLE_GPU +#include +#endif // TRITON_ENABLE_GPU + +#ifdef TRITON_PB_STUB +#include "pb_stub.h" +#include "pb_stub_utils.h" +namespace py = pybind11; +#endif +#include "pb_tensor.h" + +// WAR for undefined ssize_t on Windows: https://stackoverflow.com/a/35368387 +#if defined(_MSC_VER) +#include +typedef SSIZE_T ssize_t; +#endif + +#include +#include +#include + +namespace triton { namespace backend { namespace python { + +#ifdef TRITON_PB_STUB +py::array +deserialize_bytes_tensor_cpp(const uint8_t* data, size_t data_size) +{ + if (data_size == 0) { + py::module numpy = py::module::import("numpy"); + return numpy.attr("empty")(0, py::dtype("object")); + } + + // First pass: count the number of strings and calculate total size + size_t offset = 0; + size_t num_strings = 0; + size_t total_string_size = 0; + + while (offset < data_size) { + if (offset + 4 > data_size) { + throw PythonBackendException( + "Invalid bytes tensor data: incomplete length field"); + } + + // Read 4-byte length (little-endian) + uint32_t length = *reinterpret_cast(data + offset); + offset += 4; + + if (offset + length > data_size) { + throw PythonBackendException( + "Invalid bytes tensor data: string extends beyond buffer"); + } + + num_strings++; + total_string_size += length; + offset += length; + } + + // Create numpy array of objects using pybind11's numpy module + py::module numpy = py::module::import("numpy"); + py::array result = numpy.attr("empty")(num_strings, py::dtype("object")); + auto result_ptr = static_cast(result.request().ptr); + + // Second pass: extract strings + offset = 0; + size_t string_index = 0; + + while (offset < data_size) { + uint32_t length = *reinterpret_cast(data + offset); + offset += 4; + + // Create Python bytes object using pybind11 + py::bytes bytes_obj(reinterpret_cast(data + offset), length); + Py_INCREF(bytes_obj.ptr()); // Increment reference count + result_ptr[string_index] = bytes_obj.ptr(); + string_index++; + offset += length; + } + + return result; +} + +PbTensor::PbTensor(const std::string& name, py::array& numpy_array) + : name_(name) +{ + if (name == "") { + throw PythonBackendException("Tensor name cannot be an empty string."); + } + + dtype_ = numpy_to_triton_type(numpy_array.attr("dtype")); + memory_type_ = TRITONSERVER_MEMORY_CPU; + memory_type_id_ = 0; + dl_managed_tensor_ = nullptr; + + bool is_contiguous = + numpy_array.attr("data").attr("c_contiguous").cast(); + if (!is_contiguous) { + py::module numpy = py::module::import("numpy"); + numpy_array = numpy.attr("ascontiguousarray")(numpy_array); + } + numpy_array_ = numpy_array; + + if (dtype_ == TRITONSERVER_TYPE_BYTES) { + py::module triton_pb_utils = + py::module::import("triton_python_backend_utils"); + numpy_array_serialized_ = + triton_pb_utils.attr("serialize_byte_tensor")(numpy_array); + memory_ptr_ = numpy_array_serialized_.request().ptr; + byte_size_ = numpy_array_serialized_.nbytes(); + } else { + memory_ptr_ = numpy_array_.request().ptr; + byte_size_ = numpy_array_.nbytes(); + } + + // Initialize tensor dimension + size_t dims_count = numpy_array_.ndim(); + + const ssize_t* numpy_shape = numpy_array_.shape(); + for (size_t i = 0; i < dims_count; i++) { + dims_.push_back(numpy_shape[i]); + } +} + +PbTensor::PbTensor( + const std::string& name, py::array& numpy_array, + TRITONSERVER_DataType dtype) + : name_(name) +{ + if (name == "") { + throw PythonBackendException("Tensor name cannot be an empty string."); + } + + if (numpy_to_triton_type(numpy_array.attr("dtype")) != dtype) { + 
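+    // The array's NumPy dtype differs from the requested Triton dtype (this
+    // constructor is used for dtypes such as FP16 that have no native C++
+    // type); reinterpret the existing buffer with view() rather than copying.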
numpy_array = numpy_array.attr("view")(triton_to_numpy_type(dtype)); + } + bool is_contiguous = + numpy_array.attr("data").attr("c_contiguous").cast(); + if (!is_contiguous) { + py::module numpy = py::module::import("numpy"); + numpy_array = numpy.attr("ascontiguousarray")(numpy_array); + } + numpy_array_ = numpy_array; + + if (dtype == TRITONSERVER_TYPE_BYTES) { + py::module triton_pb_utils = + py::module::import("triton_python_backend_utils"); + numpy_array_serialized_ = + triton_pb_utils.attr("serialize_byte_tensor")(numpy_array); + memory_ptr_ = numpy_array_serialized_.request().ptr; + byte_size_ = numpy_array_serialized_.nbytes(); + + } else { + memory_ptr_ = numpy_array_.request().ptr; + byte_size_ = numpy_array_.nbytes(); + } + memory_type_ = TRITONSERVER_MEMORY_CPU; + dtype_ = dtype; + + // Initialize tensor dimension + size_t dims_count = numpy_array_.ndim(); + + const ssize_t* numpy_shape = numpy_array_.shape(); + for (size_t i = 0; i < dims_count; i++) { + dims_.push_back(numpy_shape[i]); + } + memory_type_id_ = 0; + dl_managed_tensor_ = nullptr; +} +#endif // TRITON_PB_STUB + +PbTensor::PbTensor( + const std::string& name, const std::vector& dims, + TRITONSERVER_DataType dtype, TRITONSERVER_MemoryType memory_type, + int64_t memory_type_id, void* memory_ptr, uint64_t byte_size, + DLManagedTensor* dl_managed_tensor) +{ + if (name == "") { + throw PythonBackendException("Tensor name cannot be an empty string."); + } + + name_ = name; + memory_ptr_ = memory_ptr; + memory_type_ = memory_type; + memory_type_id_ = memory_type_id; + dtype_ = dtype; + dims_ = dims; + +#ifdef TRITON_PB_STUB + if (memory_type_ == TRITONSERVER_MEMORY_CPU || + memory_type_ == TRITONSERVER_MEMORY_CPU_PINNED) { + if (dtype == TRITONSERVER_TYPE_BF16) { + // No native numpy representation for BF16. DLPack should be used instead. 
+ numpy_array_ = py::none(); + } else if (dtype != TRITONSERVER_TYPE_BYTES) { + py::object numpy_array = + py::array(triton_to_pybind_dtype(dtype_), dims_, (void*)memory_ptr_); + numpy_array_ = numpy_array.attr("view")(triton_to_numpy_type(dtype_)); + } else { + py::object numpy_array = deserialize_bytes_tensor_cpp( + static_cast(memory_ptr_), byte_size_); + numpy_array_ = numpy_array.attr("reshape")(dims_); + } + } else { + numpy_array_ = py::none(); + } +#endif + + byte_size_ = byte_size; + dl_managed_tensor_ = dl_managed_tensor; +} + +bool +PbTensor::IsCPU() const +{ + if (memory_type_ == TRITONSERVER_MEMORY_CPU || + memory_type_ == TRITONSERVER_MEMORY_CPU_PINNED) { + return true; + } else { + return false; + } +} + +TRITONSERVER_MemoryType +PbTensor::MemoryType() const +{ + return memory_type_; +} + +int64_t +PbTensor::MemoryTypeId() const +{ + return memory_type_id_; +} + +uint64_t +PbTensor::ByteSize() const +{ + return byte_size_; +} + +const std::vector& +PbTensor::Dims() const +{ + return dims_; +} + +void +PbTensor::SetMemory(std::unique_ptr&& memory) +{ + pb_memory_ = std::move(memory); + memory_type_ = pb_memory_->MemoryType(); + memory_type_id_ = pb_memory_->MemoryTypeId(); + byte_size_ = pb_memory_->ByteSize(); + memory_ptr_ = pb_memory_->DataPtr(); +} + +#ifdef TRITON_PB_STUB +void +delete_unused_dltensor(PyObject* dlp) +{ + if (PyCapsule_IsValid(dlp, "dltensor")) { + DLManagedTensor* dl_managed_tensor = + static_cast(PyCapsule_GetPointer(dlp, "dltensor")); + dl_managed_tensor->deleter(dl_managed_tensor); + } +} + + +std::shared_ptr +PbTensor::FromNumpy(const std::string& name, py::array& numpy_array) +{ + return std::make_shared(name, numpy_array); +} + +DLDeviceType +PbTensor::DeviceType() +{ + DLDeviceType device_type{}; + + switch (memory_type_) { + case TRITONSERVER_MEMORY_GPU: + device_type = DLDeviceType::kDLCUDA; + break; + case TRITONSERVER_MEMORY_CPU: + device_type = DLDeviceType::kDLCPU; + break; + case TRITONSERVER_MEMORY_CPU_PINNED: + device_type = DLDeviceType::kDLCUDAHost; + break; + } + + return device_type; +} + +py::capsule +PbTensor::DLPack(const py::object& stream) +{ + // Here external tensor requests PbTensor's `__dlpack__` method to provide + // a PyCapsule. By the design of PbTensor, in a GPU case no pending work + // is scheduled to work with PbTensor's data and we can simply pass + // the capsule without a synchronization. + return this->ToDLPack(); +} + +py::capsule +PbTensor::ToDLPack() +{ + if (dtype_ == TRITONSERVER_TYPE_BYTES) { + throw PythonBackendException( + "DLPack does not have support for string tensors."); + } + + DLManagedTensor* dlpack_tensor = new DLManagedTensor; + dlpack_tensor->dl_tensor.ndim = dims_.size(); + dlpack_tensor->dl_tensor.byte_offset = 0; + dlpack_tensor->dl_tensor.data = memory_ptr_; + dlpack_tensor->dl_tensor.shape = &dims_[0]; + dlpack_tensor->dl_tensor.strides = nullptr; + dlpack_tensor->manager_ctx = this; + dlpack_tensor->deleter = [](DLManagedTensor* m) { + // We need to acquire GIL since the framework that deleted the dlpack tensor + // may not have acquired GIL when calling this function. 
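+    // (dec_ref() below adjusts a Python reference count, which is only legal
+    // while the GIL is held.)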
+ py::gil_scoped_acquire gil; + if (m->manager_ctx == nullptr) { + return; + } + + PbTensor* tensor = reinterpret_cast(m->manager_ctx); + py::handle tensor_handle = py::cast(tensor); + tensor_handle.dec_ref(); + free(m); + }; + + PbTensor* tensor = reinterpret_cast(this); + py::handle tensor_handle = py::cast(tensor); + + // Increase the reference count by one to make sure that the DLPack + // representation doesn't become invalid when the tensor object goes out of + // scope. + tensor_handle.inc_ref(); + + dlpack_tensor->dl_tensor.device.device_id = memory_type_id_; + dlpack_tensor->dl_tensor.device.device_type = this->DeviceType(); + dlpack_tensor->dl_tensor.dtype = triton_to_dlpack_type(dtype_); + + return py::capsule( + static_cast(dlpack_tensor), "dltensor", &delete_unused_dltensor); +} + +std::pair +PbTensor::DLPackDevice() +{ + return std::pair(this->DeviceType(), memory_type_id_); +} + +#endif // TRITON_PB_STUB + +void +PbTensor::DeleteDLPack() +{ + if (dl_managed_tensor_ != nullptr) { + dl_managed_tensor_->deleter(dl_managed_tensor_); + dl_managed_tensor_ = nullptr; + } +} + +std::unique_ptr& +PbTensor::Memory() +{ + return pb_memory_; +} + +#ifdef TRITON_PB_STUB +std::shared_ptr +PbTensor::FromDLPack(const std::string& name, const py::object& tensor) +{ + if (name == "") { + throw PythonBackendException("Tensor name cannot be an empty string."); + } + if (py::isinstance(tensor)) { + return FromDLPackCapsule(name, tensor); + } + + if (!py::hasattr(tensor, "__dlpack__") || + !py::hasattr(tensor, "__dlpack_device__")) { + throw PythonBackendException( + "Provided tensor is not supported. Tensor must be a DLPack capsule \ + or have `__dlpack__` and `__dlpack_device__` attributes"); + } + + auto capsule_device_info = + tensor.attr("__dlpack_device__")().cast>(); + if (capsule_device_info.first == DLDeviceType::kDLCUDA) { +#ifdef TRITON_ENABLE_GPU + int current_device; + cudaError_t err = cudaGetDevice(¤t_device); + std::unique_ptr& stub = Stub::GetOrCreateInstance(); + if (err != cudaSuccess) { + throw PythonBackendException("Failed to get current CUDA device id."); + } + ScopedSetDevice scoped_set_device(capsule_device_info.second); + + bool overridden = (current_device != capsule_device_info.second); + cudaStream_t proxy_stream = stub->GetProxyStream(current_device); + + // Array API requirements for the stream argument: + // stream = 1 the legacy default stream (in this case should + // synchronize on CUDA stream 0) + // For CPU, `stream=None` is the only accepted argument + // according to array API. For GPU, when `stream=None` producer + // must assume the legacy default stream. Reference: + // https://data-apis.org/array-api/latest/API_specification/generated/array_api.array.__dlpack__.html + auto ptr_to_tensor = FromDLPackCapsule( + name, tensor.attr("__dlpack__")( + py::arg("stream") = + py::int_(reinterpret_cast(proxy_stream)))); + + // In case there is a pending job on the data, where this capsule + // is pointing to, we need to wait for it to finish before returning + // capsule. + // We synchronize on the proxy stream explicitly since that what we + // pass to external tensor's `__dlpack__` method. + err = cudaStreamSynchronize(proxy_stream); + if (err != cudaSuccess) { + throw PythonBackendException( + "Failed to synchronize CUDA device with id " + + std::to_string( + overridden ? 
capsule_device_info.second : current_device)); + } + + return ptr_to_tensor; +#else + throw PythonBackendException( + "DLPack capsule passed pointer to memory allocated on GPU device, \ + when GPU is not available"); +#endif + } else if ( + capsule_device_info.first != DLDeviceType::kDLCPU && + capsule_device_info.first != DLDeviceType::kDLCUDAHost) { + throw PythonBackendException( + "DLDevice type " + std::to_string(capsule_device_info.first) + + " is not support by Python backend."); + } + + // If data is located on a CPU, `stream=None` is the only accepted argument + // according to array API. + // Reference: + // https://data-apis.org/array-api/latest/API_specification/generated/array_api.array.__dlpack__.html + return FromDLPackCapsule( + name, tensor.attr("__dlpack__")(py::arg("stream") = py::none())); +} + +std::shared_ptr +PbTensor::FromDLPackCapsule( + const std::string& name, const py::capsule& dlpack_tensor) +{ + DLManagedTensor* dl_managed_tensor = + static_cast(dlpack_tensor.get_pointer()); + + void* memory_ptr = dl_managed_tensor->dl_tensor.data; + memory_ptr = reinterpret_cast(memory_ptr) + + dl_managed_tensor->dl_tensor.byte_offset; + + int64_t* strides = dl_managed_tensor->dl_tensor.strides; + + int ndim = dl_managed_tensor->dl_tensor.ndim; + std::vector dims( + dl_managed_tensor->dl_tensor.shape, + dl_managed_tensor->dl_tensor.shape + ndim); + + // Check if the input is contiguous and in C order + if (strides != nullptr) { + int64_t calculated_stride{1}; + bool is_contiguous_c_order = true; + for (size_t i = 1; i < dims.size(); i++) { + if (dims[ndim - i] != 1) { + if (strides[ndim - i] != calculated_stride) { + is_contiguous_c_order = false; + break; + } + + calculated_stride *= dims[ndim - i]; + } + } + + if (!is_contiguous_c_order) { + throw PythonBackendException( + "DLPack tensor is not contiguous. Only contiguous DLPack " + "tensors that are stored in C-Order are supported."); + } + } + + TRITONSERVER_MemoryType memory_type; + int64_t memory_type_id; + + switch (dl_managed_tensor->dl_tensor.device.device_type) { + case DLDeviceType::kDLCUDA: + memory_type = TRITONSERVER_MEMORY_GPU; + memory_type_id = dl_managed_tensor->dl_tensor.device.device_id; + break; + case DLDeviceType::kDLCPU: + memory_type = TRITONSERVER_MEMORY_CPU; + memory_type_id = 0; + break; + case DLDeviceType::kDLCUDAHost: + memory_type = TRITONSERVER_MEMORY_CPU; + memory_type_id = 0; + break; + default: + throw PythonBackendException( + "DLDevice type " + + std::to_string(dl_managed_tensor->dl_tensor.device.device_type) + + " is not support by Python backend."); + break; + } + + TRITONSERVER_DataType dtype = + dlpack_to_triton_type(dl_managed_tensor->dl_tensor.dtype); + + // Calculate tensor size. 
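+  // byte_size = (product of dims) * bytes per element, where the DLPack bit
+  // width is rounded up to whole bytes.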
+ uint64_t byte_size = 1; + for (auto& dim : dims) { + byte_size *= dim; + } + byte_size *= (dl_managed_tensor->dl_tensor.dtype.bits + 7) / 8; + + PyCapsule_SetName(dlpack_tensor.ptr(), "used_dlpack"); + return std::make_unique( + name, dims, dtype, memory_type, memory_type_id, memory_ptr, byte_size, + dl_managed_tensor); +} +#endif // TRITON_PB_STUB + +PbTensor::~PbTensor() noexcept(false) +{ + pb_memory_.reset(); + DeleteDLPack(); + +#ifdef TRITON_PB_STUB + { + py::gil_scoped_acquire acquire; + py::array numpy_array_local(std::move(numpy_array_)); + py::array numpy_array_serialized_local(std::move(numpy_array_serialized_)); + } +#endif +} + +const std::string& +PbTensor::Name() const +{ + return name_; +} + +#ifdef TRITON_PB_STUB +const py::array* +PbTensor::AsNumpy() const +{ + if (!IsCPU()) { + throw PythonBackendException( + "Tensor is stored in GPU and cannot be converted to NumPy."); + } + + if (dtype_ == TRITONSERVER_TYPE_BF16) { + throw PythonBackendException( + "Tensor dtype is BF16 and cannot be converted to NumPy. Use " + "to_dlpack() and from_dlpack() instead."); + } + + return &numpy_array_; +} +#endif // TRITON_PB_STUB + +void +PbTensor::SaveToSharedMemory( + std::unique_ptr& shm_pool, bool copy_gpu) +{ + if (!tensor_shm_.data_) { + uint64_t byte_size; + if (!pb_memory_) { + byte_size = sizeof(TensorShm) + sizeof(int64_t) * dims_.size() + + PbString::ShmStructSize(name_) + + PbMemory::ShmStructSize(memory_type_, byte_size_); + + } else { + byte_size = sizeof(TensorShm) + sizeof(int64_t) * dims_.size() + + PbString::ShmStructSize(name_); + } + tensor_shm_ = shm_pool->Construct(byte_size); + + tensor_shm_ptr_ = reinterpret_cast(tensor_shm_.data_.get()); + tensor_shm_ptr_->dtype = dtype_; + tensor_shm_ptr_->dims_count = dims_.size(); + shm_handle_ = tensor_shm_.handle_; + + dims_shm_ptr_ = reinterpret_cast( + reinterpret_cast(tensor_shm_ptr_) + sizeof(TensorShm)); + + // Write the dimensions data to shared memory. 
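+    // The block constructed above is laid out as: TensorShm header, then
+    // dims_count int64 dimensions, then the PbString name, then (when the
+    // tensor does not already own a PbMemory) the serialized PbMemory section.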
+ for (size_t i = 0; i < dims_.size(); i++) { + dims_shm_ptr_[i] = dims_[i]; + } + + std::size_t name_offset = + sizeof(TensorShm) + sizeof(int64_t) * dims_.size(); + name_shm_ = PbString::Create( + name_, reinterpret_cast(tensor_shm_ptr_) + name_offset, + shm_handle_ + name_offset); + std::size_t pb_memory_offset = name_offset + PbString::ShmStructSize(name_); + + if (!pb_memory_) { + pb_memory_ = PbMemory::Create( + shm_pool, memory_type_, memory_type_id_, byte_size_, + reinterpret_cast(memory_ptr_), + reinterpret_cast(tensor_shm_ptr_) + pb_memory_offset, + shm_handle_ + pb_memory_offset, copy_gpu); + tensor_shm_ptr_->memory = 0; + } else { + tensor_shm_ptr_->memory = pb_memory_->ShmHandle(); + } + + memory_ptr_ = pb_memory_->DataPtr(); + } +} + +std::unique_ptr +PbTensor::LoadFromSharedMemory( + std::unique_ptr& shm_pool, + bi::managed_external_buffer::handle_t tensor_handle, bool open_cuda_handle) +{ + AllocatedSharedMemory tensor_shm = shm_pool->Load(tensor_handle); + TensorShm* tensor_shm_ptr = + reinterpret_cast(tensor_shm.data_.get()); + size_t name_offset = + sizeof(TensorShm) + sizeof(int64_t) * tensor_shm_ptr->dims_count; + std::unique_ptr name_shm = PbString::LoadFromSharedMemory( + tensor_handle + name_offset, tensor_shm.data_.get() + name_offset); + + std::unique_ptr pb_memory; + if (tensor_shm_ptr->memory == 0) { + std::size_t pb_memory_offset = name_offset + name_shm->Size(); + pb_memory = PbMemory::LoadFromSharedMemory( + shm_pool, pb_memory_offset, tensor_shm.data_.get() + pb_memory_offset, + open_cuda_handle); + } else { + pb_memory = PbMemory::LoadFromSharedMemory( + shm_pool, tensor_shm_ptr->memory, open_cuda_handle); + } + + return std::unique_ptr( + new PbTensor(tensor_shm, name_shm, pb_memory)); +} + +TRITONSERVER_DataType +PbTensor::TritonDtype() const +{ + return dtype_; +} + +void* +PbTensor::DataPtr() +{ + return memory_ptr_; +} + +bi::managed_external_buffer::handle_t +PbTensor::ShmHandle() +{ + return shm_handle_; +} + +PbTensor::PbTensor( + AllocatedSharedMemory& tensor_shm, + std::unique_ptr& name_shm, std::unique_ptr& pb_memory) + : tensor_shm_(std::move(tensor_shm)), name_shm_(std::move(name_shm)), + pb_memory_(std::move(pb_memory)) +{ + tensor_shm_ptr_ = reinterpret_cast(tensor_shm_.data_.get()); + dims_shm_ptr_ = reinterpret_cast( + reinterpret_cast(tensor_shm_ptr_) + sizeof(TensorShm)); + + name_ = name_shm_->String(); + dims_ = std::vector( + dims_shm_ptr_, dims_shm_ptr_ + tensor_shm_ptr_->dims_count); + dtype_ = tensor_shm_ptr_->dtype; + dl_managed_tensor_ = nullptr; + byte_size_ = pb_memory_->ByteSize(); + memory_ptr_ = pb_memory_->DataPtr(); + memory_type_ = pb_memory_->MemoryType(); + memory_type_id_ = pb_memory_->MemoryTypeId(); + shm_handle_ = tensor_shm_.handle_; + +#ifdef TRITON_PB_STUB + if (memory_type_ == TRITONSERVER_MEMORY_CPU || + memory_type_ == TRITONSERVER_MEMORY_CPU_PINNED) { + if (dtype_ == TRITONSERVER_TYPE_BF16) { + // No native numpy representation for BF16. DLPack should be used instead. 
+ numpy_array_ = py::none(); + } else if (dtype_ != TRITONSERVER_TYPE_BYTES) { + py::object numpy_array = + py::array(triton_to_pybind_dtype(dtype_), dims_, (void*)memory_ptr_); + numpy_array_ = numpy_array.attr("view")(triton_to_numpy_type(dtype_)); + } else { + py::object numpy_array = deserialize_bytes_tensor_cpp( + static_cast(memory_ptr_), byte_size_); + numpy_array_ = numpy_array.attr("reshape")(dims_); + } + } else { + numpy_array_ = py::none(); + } +#endif +} +}}} // namespace triton::backend::python diff --git a/src/pb_tensor.h b/src/pb_tensor.h new file mode 100644 index 00000000..4f97b643 --- /dev/null +++ b/src/pb_tensor.h @@ -0,0 +1,260 @@ +// Copyright 2021-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions +// are met: +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// * Neither the name of NVIDIA CORPORATION nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +#pragma once + +#ifdef TRITON_ENABLE_GPU +#include +#endif // TRITON_ENABLE_GPU + +#include + +#ifdef TRITON_PB_STUB +#include +#include +#include +namespace py = pybind11; +#endif + +#include +#include + +#include "pb_memory.h" +#include "pb_string.h" +#include "pb_utils.h" +#include "triton/backend/backend_common.h" +#include "triton/backend/backend_memory.h" +#include "triton/core/tritonserver.h" + +namespace triton { namespace backend { namespace python { + +// +// Represents a Tensor object in shared memory. +// +struct TensorShm { + // Handle for the pointer data in shared memory. + bi::managed_external_buffer::handle_t memory; + TRITONSERVER_DataType dtype; + size_t dims_count; +}; + +// PbTensor class is the representation of Triton tensors inside Python backend. +class PbTensor { + public: +#ifdef TRITON_PB_STUB + /// Create a PbTensor using a numpy array + /// \param name The name of the tensor + /// \param numpy_array Numpy array to use for the initialization of the tensor + PbTensor(const std::string& name, py::array& numpy_array); + + /// Create a PbTensor using a numpy array. This constructor is used for types + /// that are not natively available in C++ such as float16. This constructor + /// will fix the type of the NumPy array to match the Triton dtype. 
+ /// \param name The name of the tensor + /// \param numpy_array Numpy array to use for the initialization of the tensor + /// \param dtype The triton dtype + PbTensor( + const std::string& name, py::array& numpy_array, + TRITONSERVER_DataType dtype); +#endif + + /// Create a PbTensor from raw pointer. This constructor is used for + /// interfacing with DLPack tensors. + /// \param name The name of the tensor + /// \param dims Tensor dimensions + /// \param dtype Triton dtype + /// \param memory_type The memory type of the tensor + /// \param memory_type_id The memory type_id of the tensor + /// \param memory_ptr Pointer to the location of the data. Data must be + /// contiguous and in C-order. + /// \param byte_size Total number of bytes that the tensor uses. + /// \param shm_handle The shared memory handle of the pointer if it is stored + /// in shared memory. + PbTensor( + const std::string& name, const std::vector<int64_t>& dims, + TRITONSERVER_DataType dtype, TRITONSERVER_MemoryType memory_type, + int64_t memory_type_id, void* memory_ptr, uint64_t byte_size, + DLManagedTensor* dl_managed_tensor = nullptr); + + /// This constructor is used when loading the tensor from shared memory. + /// \param tensor_shm The shared memory chunk holding the serialized tensor + /// \param name_shm The tensor name loaded from shared memory + /// \param pb_memory The tensor data loaded from shared memory + PbTensor( + AllocatedSharedMemory<char>& tensor_shm, + std::unique_ptr<PbString>& name_shm, + std::unique_ptr<PbMemory>& pb_memory); + + // Copying tensor objects is not allowed. + DISALLOW_COPY_AND_ASSIGN(PbTensor); + +#ifdef TRITON_PB_STUB + /// Construct a Python backend tensor from an + /// external tensor. + /// \param dlpack source dlpack tensor + /// \param name name of the tensor + static std::shared_ptr<PbTensor> FromDLPack( + const std::string& name, const py::object& dlpack); + + /// Construct a Python backend tensor using a DLPack + /// capsule. + static std::shared_ptr<PbTensor> FromDLPackCapsule( + const std::string& name, const py::capsule& dlpack); + + /// Construct a Python backend tensor using a NumPy object. + /// \param numpy_array Numpy array + /// \param name name of the tensor + static std::shared_ptr<PbTensor> FromNumpy( + const std::string& name, py::array& numpy_array); + + /// Get device type in DLPack format. + DLDeviceType DeviceType(); + + /// Exports tensor for consumption by `from_dlpack()` as a DLPack capsule. + /// \param stream a Python integer representing a pointer to a stream, + /// on devices that support streams + /// \return Capsule object containing pointer to a DLPack object. + py::capsule DLPack(const py::object& stream); + + /// Get a PyCapsule object containing the DLPack representation of the tensor. + /// \return Capsule object containing pointer to a DLPack object. + py::capsule ToDLPack(); + + /// Returns device type and device ID. + /// Meant for use within `from_dlpack()`. + /// \return a pair (device_type, device_id). + std::pair<int32_t, int64_t> DLPackDevice(); +#endif + + /// Get the name of the tensor + /// \return name of the tensor. + const std::string& Name() const; + + /// Set the name of the tensor + /// \param name Name of the tensor. + void SetName(const std::string& name); + + /// Get the shared memory handle corresponding to this tensor + /// \return returns the shared memory handle. + bi::managed_external_buffer::handle_t ShmHandle(); + + /// Load the tensor object from shared memory. + /// \param shm_pool The shared memory manager object + /// \param tensor_handle The handle of the object in shared memory.
+ /// \param open_cuda_handle If the tensor is in GPU, setting this option to + /// true will call cudaIpcOpenMemHandle on it. In the main process this option + /// should be set to false because we never want to call cudaIpcOpenMemHandle + /// in the main process. + /// \return returns the tensor loaded from shared memory. + static std::unique_ptr LoadFromSharedMemory( + std::unique_ptr& shm_pool, + bi::managed_external_buffer::handle_t tensor_handle, + bool open_cuda_handle); + +#ifdef TRITON_PB_STUB + /// Get NumPy representation of the tensor. + /// \throw If the tensor is stored in GPU, an exception is thrown + /// \return NumPy representation of the Tensor + const py::array* AsNumpy() const; +#endif + + /// Save tensor inside shared memory. + void SaveToSharedMemory( + std::unique_ptr& shm_pool, bool copy_gpu); + + /// Get the triton dtype + /// \return Triton dtype + TRITONSERVER_DataType TritonDtype() const; + + /// Get the data ptr + /// \return Get the raw pointer. + void* DataPtr(); + + /// This function will be automatically called by the stub when the tensor is + /// no longer required. + void DeleteDLPack(); + + /// Tells whether the Tensor is stored in CPU or not. + /// \return A boolean value indicating whether the tensor is stored in CPU + /// or not. + bool IsCPU() const; + + /// Get the total byte size of the tensor. + uint64_t ByteSize() const; + + /// Get the triton memory type of the Tensor. + /// \return the memory type of the tensor. + TRITONSERVER_MemoryType MemoryType() const; + + /// Get a mutable reference to the MemoryType. + /// \return the pointer to the memory type of the tensor. + TRITONSERVER_MemoryType* MutableMemoryType(); + + /// Get the triton memory type of the Tensor. + /// \return the memory type of the tensor. + int64_t MemoryTypeId() const; + + /// Get the dimensions of the tensor + /// \return A vector containing the tensor dimensions. + const std::vector& Dims() const; + + /// Get the underlying memory + std::unique_ptr& Memory(); + + /// Set the underlying memory + void SetMemory(std::unique_ptr&& memory); + + PbTensor(); + + /// Destructor + ~PbTensor() noexcept(false); + + private: + std::string name_; +#ifdef TRITON_PB_STUB + py::array numpy_array_; + // Storing the serialized version of the numpy array + py::array numpy_array_serialized_; +#endif + TRITONSERVER_DataType dtype_; + void* memory_ptr_; + int64_t memory_type_id_; + std::vector dims_; + TRITONSERVER_MemoryType memory_type_; + uint64_t byte_size_; + DLManagedTensor* dl_managed_tensor_; + + bi::managed_external_buffer::handle_t shm_handle_; + + AllocatedSharedMemory tensor_shm_; + TensorShm* tensor_shm_ptr_; + int64_t* dims_shm_ptr_; + std::unique_ptr name_shm_; + + // The pointer is null when the object is not stored in shared memory. + std::unique_ptr pb_memory_; +}; +}}} // namespace triton::backend::python diff --git a/src/pb_utils.cc b/src/pb_utils.cc index d3db2e61..79b45ec2 100644 --- a/src/pb_utils.cc +++ b/src/pb_utils.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// Copyright 2021-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
// // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions @@ -26,270 +26,415 @@ #include "pb_utils.h" -#include -#include -#include -#include -#include -#include -#include -#include #include -#include + +#include + +#ifdef _WIN32 +#include + +#include +#else +#include #include -#include -#include -#include -#include -#include -#include "shm_manager.h" +#endif + +#ifndef _WIN32 +extern char** environ; +#endif + + +#ifdef TRITON_ENABLE_GPU +#include +#include +#endif namespace triton { namespace backend { namespace python { -#define THROW_IF_ERROR(MSG, X) \ - do { \ - int return__ = (X); \ - if (return__ != 0) { \ - throw PythonBackendException(MSG); \ - } \ - } while (false) +#ifdef TRITON_ENABLE_GPU + +CUDAHandler::CUDAHandler() +{ + dl_open_handle_ = LoadSharedObject("libcuda.so"); + + // If libcuda.so is successfully opened, it must be able to find + // "cuPointerGetAttribute", "cuGetErrorString", and + // "cuDevicePrimaryCtxGetState" symbols. + if (dl_open_handle_ != nullptr) { + void* cu_pointer_get_attribute_fn = LocateSymbol("cuPointerGetAttribute"); + if (cu_pointer_get_attribute_fn == nullptr) { + throw PythonBackendException( + std::string("Failed to locate 'cuPointerGetAttribute'. Error: ") + + LocateSymbolError()); + } + *((void**)&cu_pointer_get_attribute_fn_) = cu_pointer_get_attribute_fn; + + void* cu_get_error_string_fn = LocateSymbol("cuGetErrorString"); + if (cu_get_error_string_fn == nullptr) { + throw PythonBackendException( + std::string("Failed to locate 'cuGetErrorString'. Error: ") + + LocateSymbolError()); + } + *((void**)&cu_get_error_string_fn_) = cu_get_error_string_fn; + + void* cu_init_fn = LocateSymbol("cuInit"); + if (cu_init_fn == nullptr) { + throw PythonBackendException( + std::string("Failed to locate 'cuInit'. Error: ") + + LocateSymbolError()); + } + *((void**)&cu_init_fn_) = cu_init_fn; + + void* cu_device_primary_ctx_get_state_fn = + LocateSymbol("cuDevicePrimaryCtxGetState"); + if (cu_device_primary_ctx_get_state_fn == nullptr) { + throw PythonBackendException( + std::string( + "Failed to locate 'cuDevicePrimaryCtxGetState'. Error: ") + + LocateSymbolError()); + } + *((void**)&cu_device_primary_ctx_get_state_fn_) = + cu_device_primary_ctx_get_state_fn; + + // Initialize the driver API. 
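The CUDAHandler constructor above binds the driver API lazily so the backend can still load on hosts without a GPU driver: it opens libcuda.so at runtime, resolves the handful of symbols it needs, and only then calls cuInit. A minimal sketch of that dlopen/dlsym pattern, assuming a Linux host and standing in int for CUresult so the example needs no CUDA headers:

// Compile with -ldl if needed; the sketch degrades gracefully when the
// driver library is absent, mirroring the optional-GPU behavior above.
#include <dlfcn.h>
#include <cstdio>

int main() {
  void* handle = dlopen("libcuda.so", RTLD_LAZY);
  if (handle == nullptr) {
    std::fprintf(stderr, "libcuda.so not found: %s\n", dlerror());
    return 0;  // Not fatal for the sketch: GPU support is optional.
  }

  using CuInitFn = int (*)(unsigned int);  // CUresult cuInit(unsigned int)
  auto cu_init = reinterpret_cast<CuInitFn>(dlsym(handle, "cuInit"));
  if (cu_init == nullptr) {
    std::fprintf(stderr, "cuInit not found: %s\n", dlerror());
  } else {
    int err = cu_init(0 /* flags */);
    std::printf("cuInit returned %d\n", err);
  }

  dlclose(handle);
  return 0;
}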
+ CUresult cuda_err = (*cu_init_fn_)(0 /* flags */); + if (cuda_err != CUDA_SUCCESS) { + const char* error_string; + (*cu_get_error_string_fn_)(cuda_err, &error_string); + error_str_ = std::string("failed to call cuInit: ") + error_string; + CloseLibrary(); + dl_open_handle_ = nullptr; + } + } +} void -LoadStringFromSharedMemory( - std::unique_ptr& shm_pool, off_t shm_offset, char*& str) +CUDAHandler::PointerGetAttribute( + CUdeviceptr* start_address, CUpointer_attribute attribute, + CUdeviceptr dev_ptr) { - String* string; - shm_pool->MapOffset((char**)&string, sizeof(String), shm_offset); - shm_pool->MapOffset((char**)&str, string->length, string->data); + CUresult cuda_err = + (*cu_pointer_get_attribute_fn_)(start_address, attribute, dev_ptr); + if (cuda_err != CUDA_SUCCESS) { + const char* error_string; + (*cu_get_error_string_fn_)(cuda_err, &error_string); + throw PythonBackendException( + std::string( + "failed to get cuda pointer device attribute: " + + std::string(error_string)) + .c_str()); + } +} + +bool +CUDAHandler::IsAvailable() +{ + return dl_open_handle_ != nullptr; } void -SaveStringToSharedMemory( - std::unique_ptr& shm_pool, off_t& shm_offset, const char* str) +CUDAHandler::OpenCudaHandle( + int64_t memory_type_id, cudaIpcMemHandle_t* cuda_mem_handle, + void** data_ptr) { - String* string_shm; - shm_pool->Map((char**)&string_shm, sizeof(String), shm_offset); - string_shm->length = strlen(str) + 1; - - char* string_data; - off_t str_data_offset; - shm_pool->Map((char**)&string_data, string_shm->length, str_data_offset); - string_shm->data = str_data_offset; - strcpy(string_data, str); + std::lock_guard guard{mu_}; + ScopedSetDevice scoped_set_device(memory_type_id); + + cudaError_t err = cudaIpcOpenMemHandle( + data_ptr, *cuda_mem_handle, cudaIpcMemLazyEnablePeerAccess); + if (err != cudaSuccess) { + throw PythonBackendException( + std::string("Failed to open the cudaIpcHandle. error: ") + + cudaGetErrorString(err)); + } } void -SaveRawDataToSharedMemory( - std::unique_ptr& shm_pool, off_t& raw_data_offset, - char*& raw_data_ptr, TRITONSERVER_MemoryType memory_type, - int memory_type_id, uint64_t byte_size) +CUDAHandler::CloseCudaHandle(int64_t memory_type_id, void* data_ptr) { - // raw data - RawData* raw_data; - shm_pool->Map((char**)&raw_data, sizeof(RawData), raw_data_offset); + std::lock_guard guard{mu_}; + int current_device; - raw_data->memory_type = memory_type; - raw_data->memory_type_id = memory_type_id; - raw_data->byte_size = byte_size; + // Save the previous device + cudaError_t err = cudaGetDevice(¤t_device); + if (err != cudaSuccess) { + throw PythonBackendException( + std::string("Failed to get the current CUDA device. error: ") + + cudaGetErrorString(err)); + } - off_t buffer_offset; - shm_pool->Map((char**)&raw_data_ptr, byte_size, buffer_offset); - raw_data->memory_ptr = buffer_offset; + // Restore the previous device before returning from the function. + ScopedSetDevice scoped_set_device(memory_type_id); + err = cudaIpcCloseMemHandle(data_ptr); + if (err != cudaSuccess) { + throw PythonBackendException( + std::string("Failed to close the cudaIpcHandle. 
error: ") + + cudaGetErrorString(err)); + } } -void -SaveMapToSharedMemory( - std::unique_ptr& shm_pool, off_t& shm_offset, - const std::unordered_map& map) +bool +CUDAHandler::HasPrimaryContext(int device) { - Dict* dict; - shm_pool->Map((char**)&dict, sizeof(Dict), shm_offset); - dict->length = map.size(); - - Pair* pairs; - shm_pool->Map((char**)&pairs, sizeof(Pair) * map.size(), dict->values); - - size_t i = 0; - for (const auto& pair : map) { - SaveStringToSharedMemory(shm_pool, pairs[i].key, pair.first.c_str()); - SaveStringToSharedMemory(shm_pool, pairs[i].value, pair.second.c_str()); - i += 1; + unsigned int ctx_flags; + int ctx_is_active = 0; + CUresult cuda_err = (*cu_device_primary_ctx_get_state_fn_)( + device, &ctx_flags, &ctx_is_active); + if (cuda_err != CUDA_SUCCESS) { + const char* error_string; + (*cu_get_error_string_fn_)(cuda_err, &error_string); + throw PythonBackendException( + std::string( + "failed to get primary context state: " + std::string(error_string)) + .c_str()); } + + return ctx_is_active == 1; } void -LoadMapFromSharedMemory( - std::unique_ptr& shm_pool, off_t shm_offset, - std::unordered_map& map) +CUDAHandler::MaybeSetDevice(int device) { - Dict* dict; - shm_pool->MapOffset((char**)&dict, sizeof(Dict), shm_offset); - - Pair* pairs; - shm_pool->MapOffset( - (char**)&pairs, sizeof(Pair) * dict->length, dict->values); - for (size_t i = 0; i < dict->length; i++) { - char* key; - LoadStringFromSharedMemory(shm_pool, pairs[i].key, key); - - char* value; - LoadStringFromSharedMemory(shm_pool, pairs[i].value, value); - map.emplace(std::make_pair(key, value)); + if (HasPrimaryContext(device)) { + cudaError_t err = cudaSetDevice(device); + if (err != cudaSuccess) { + throw PythonBackendException( + std::string("Failed to set the CUDA device to ") + + std::to_string(device) + ". error: " + cudaGetErrorString(err)); + } } } -void -SaveTensorToSharedMemory( - std::unique_ptr& shm_pool, Tensor* tensor, - char*& raw_data_ptr, TRITONSERVER_MemoryType memory_type, - int memory_type_id, uint64_t byte_size, const char* name, - const int64_t* dims, size_t dims_count, TRITONSERVER_DataType dtype) + +CUDAHandler::~CUDAHandler() noexcept(false) { - // Raw Data - off_t raw_data_offset; - SaveRawDataToSharedMemory( - shm_pool, raw_data_offset, raw_data_ptr, memory_type, memory_type_id, - byte_size); - tensor->raw_data = raw_data_offset; - - // name - off_t name_offset; - SaveStringToSharedMemory(shm_pool, name_offset, name); - tensor->name = name_offset; - - // input dtype - tensor->dtype = dtype; - - // input dims - int64_t* tensor_dims; - tensor->dims_count = dims_count; - off_t tensor_dims_offset; - shm_pool->Map( - (char**)&tensor_dims, sizeof(int64_t) * dims_count, tensor_dims_offset); - tensor->dims = tensor_dims_offset; - - for (size_t j = 0; j < dims_count; ++j) { - tensor_dims[j] = dims[j]; + if (dl_open_handle_ != nullptr) { + CloseLibrary(); } } -void -CopySingleArchiveEntry(archive* input_archive, archive* output_archive) +void* +CUDAHandler::LoadSharedObject(const char* filename) { - const void* buff; - size_t size; -#if ARCHIVE_VERSION_NUMBER >= 3000000 - int64_t offset; +#ifdef _WIN32 + // NOTE: 'nvcuda.dll' is a placeholder library. Apparently, this should be the + // equivalent library for Windows, but need to verify. 
+ return LoadLibraryA("nvcuda.dll"); #else - off_t offset; + return dlopen("libcuda.so", RTLD_LAZY); #endif +} - for (;;) { - int return_status; - return_status = - archive_read_data_block(input_archive, &buff, &size, &offset); - if (return_status == ARCHIVE_EOF) - break; - if (return_status != ARCHIVE_OK) - throw PythonBackendException( - "archive_read_data_block() failed with error code = " + - std::to_string(return_status)); +void* +CUDAHandler::LocateSymbol(const char* symbol) +{ +#ifdef _WIN32 + return GetProcAddress(static_cast(dl_open_handle_), symbol); +#else + return dlsym(dl_open_handle_, symbol); +#endif +} - return_status = - archive_write_data_block(output_archive, buff, size, offset); - if (return_status != ARCHIVE_OK) { - throw PythonBackendException( - "archive_write_data_block() failed with error code = " + - std::to_string(return_status) + ", error message is " + - archive_error_string(output_archive)); - } - } + +std::string +CUDAHandler::LocateSymbolError() +{ +#ifdef _WIN32 + return std::to_string(GetLastError()); +#else + return dlerror(); +#endif } void -ExtractTarFile(std::string& archive_path, std::string& dst_path) +CUDAHandler::CloseLibrary() { - char current_directory[PATH_MAX]; - if (getcwd(current_directory, PATH_MAX) == nullptr) { - throw PythonBackendException( - (std::string("Failed to get the current working directory. Error: ") + - std::strerror(errno))); + bool successful = true; +#ifdef _WIN32 + successful = (FreeLibrary(static_cast(dl_open_handle_)) != 0); +#else + successful = (dlclose(dl_open_handle_) == 0); +#endif + if (!successful) { + throw PythonBackendException("Failed to close the cuda library handle."); } - if (chdir(dst_path.c_str()) == -1) { - throw PythonBackendException( - (std::string("Failed to change the directory to ") + dst_path + - " Error: " + std::strerror(errno)) - .c_str()); +} + + +ScopedSetDevice::ScopedSetDevice(int device) +{ + device_ = device; + THROW_IF_CUDA_ERROR(cudaGetDevice(¤t_device_)); + + if (current_device_ != device_) { + THROW_IF_CUDA_ERROR(cudaSetDevice(device_)); } +} - struct archive_entry* entry; - int flags = ARCHIVE_EXTRACT_TIME; +ScopedSetDevice::~ScopedSetDevice() +{ + if (current_device_ != device_) { + CUDAHandler& cuda_handler = CUDAHandler::getInstance(); + cuda_handler.MaybeSetDevice(current_device_); + } +} + +bool +IsUsingCUDAPool( + std::unique_ptr& cuda_pool, int64_t memory_type_id, + void* data) +{ + CUDAHandler& cuda_api = CUDAHandler::getInstance(); + CUdeviceptr cuda_pool_address = 0; + cuda_api.PointerGetAttribute( + &cuda_pool_address, CU_POINTER_ATTRIBUTE_RANGE_START_ADDR, + reinterpret_cast(data)); + + return ( + cuda_pool->CUDAPoolAddress(memory_type_id) == + reinterpret_cast(cuda_pool_address)); +} - struct archive* input_archive = archive_read_new(); - struct archive* output_archive = archive_write_disk_new(); - archive_write_disk_set_options(output_archive, flags); +#endif // TRITON_ENABLE_GPU - archive_read_support_filter_gzip(input_archive); - archive_read_support_format_tar(input_archive); +// FIXME: [DLIS-6078]: We should not need this function. However, some paths are +// being retrieved from core that are not platform-agnostic. 
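ScopedSetDevice above is a plain RAII guard: it records the caller's current device, switches to the target device for the lifetime of the scope, and restores the original device on destruction (through MaybeSetDevice, so no new primary context is created). A generic sketch of the same idea against the CUDA runtime API, with the primary-context check omitted and error handling reduced to a message:

// Assumes the CUDA runtime is available; DeviceGuardSketch is a generic
// stand-in, not the backend's ScopedSetDevice.
#include <cuda_runtime_api.h>
#include <cstdio>

class DeviceGuardSketch {
 public:
  explicit DeviceGuardSketch(int device) : target_(device), previous_(device) {
    // Record the caller's device and switch only if it actually differs.
    if (cudaGetDevice(&previous_) == cudaSuccess && previous_ != target_) {
      cudaSetDevice(target_);
    }
  }
  ~DeviceGuardSketch() {
    // Restore the caller's device; the real class additionally checks that a
    // primary context exists before touching the device.
    if (previous_ != target_ && cudaSetDevice(previous_) != cudaSuccess) {
      std::fprintf(stderr, "failed to restore CUDA device %d\n", previous_);
    }
  }

 private:
  int target_;
  int previous_;
};

void WorkOnDevice(int device) {
  DeviceGuardSketch guard(device);
  // ... allocations, copies, or kernel launches on `device` ...
}  // caller's device restored here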
+void +SanitizePath(std::string& path) +{ + std::replace(path.begin(), path.end(), '/', '\\'); +} - if (archive_path.size() == 0) { - throw PythonBackendException("The archive path is empty."); +#ifndef TRITON_PB_STUB +std::shared_ptr +WrapTritonErrorInSharedPtr(TRITONSERVER_Error* error) +{ + std::shared_ptr response_error( + new TRITONSERVER_Error*, [](TRITONSERVER_Error** error) { + if (error != nullptr && *error != nullptr) { + TRITONSERVER_ErrorDelete(*error); + } + + if (error != nullptr) { + delete error; + } + }); + *response_error = error; + return response_error; +} +#endif // NOT TRITON_PB_STUB + +bool +IsValidIdentifier(const std::string& input) +{ + // Check for invalid characters + if (input.empty() || + input.find_first_of(INVALID_CHARS) != std::string::npos) { + return false; } - THROW_IF_ERROR( - "archive_read_open_filename() failed.", - archive_read_open_filename( - input_archive, archive_path.c_str(), 10240 /* block_size */)); + return true; +} - while (true) { - int read_status = archive_read_next_header(input_archive, &entry); - if (read_status == ARCHIVE_EOF) - break; - if (read_status != ARCHIVE_OK) { - throw PythonBackendException( - std::string("archive_read_next_header() failed with error code = ") + - std::to_string(read_status) + std::string(" error message is ") + - archive_error_string(input_archive)); - } +bool +IsExecutableFile(const std::string& filepath) +{ + struct stat file_stat; + if (stat(filepath.c_str(), &file_stat) != 0) { + return false; + } - read_status = archive_write_header(output_archive, entry); - if (read_status != ARCHIVE_OK) { - throw PythonBackendException(std::string( - "archive_write_header() failed with error code = " + - std::to_string(read_status) + std::string(" error message is ") + - archive_error_string(output_archive))); - } + // Check if it's a regular file and executable by owner + return S_ISREG(file_stat.st_mode) && (file_stat.st_mode & S_IXUSR); +} - CopySingleArchiveEntry(input_archive, output_archive); +std::string +GenerateUUID() +{ + static boost::uuids::random_generator generator; + boost::uuids::uuid uuid = generator(); + return boost::uuids::to_string(uuid); +} - read_status = archive_write_finish_entry(output_archive); - if (read_status != ARCHIVE_OK) { - throw PythonBackendException(std::string( - "archive_write_finish_entry() failed with error code = " + - std::to_string(read_status) + std::string(" error message is ") + - archive_error_string(output_archive))); +// Helper function to get environment variables for Python virtual environments +std::map +ParseActivationScript(const std::string& activate_path) +{ + std::map env_vars; + + // Read the current environment as baseline +#ifndef _WIN32 + if (environ != nullptr) { + for (char** env = environ; *env != nullptr; env++) { + std::string env_str(*env); + size_t eq_pos = env_str.find('='); + if (eq_pos != std::string::npos) { + std::string key = env_str.substr(0, eq_pos); + std::string value = env_str.substr(eq_pos + 1); + env_vars[key] = value; + } } } +#endif - archive_read_close(input_archive); - archive_read_free(input_archive); + // Extract virtual environment root from activation script path + std::string venv_path = activate_path; + size_t bin_activate_pos = venv_path.find("/bin/activate"); + if (bin_activate_pos != std::string::npos) { + venv_path = venv_path.substr(0, bin_activate_pos); + } - archive_write_close(output_archive); - archive_write_free(output_archive); + // Set standard virtual environment variables + env_vars["VIRTUAL_ENV"] = venv_path; + 
env_vars["VIRTUAL_ENV_PROMPT"] = "(" + venv_path + ")"; - // Revert the directory change. - if (chdir(current_directory) == -1) { - throw PythonBackendException( - (std::string("Failed to change the directory to ") + current_directory) - .c_str()); + // Update PATH to include the virtual environment's bin directory + std::string new_path = venv_path + "/bin"; + if (env_vars.find("PATH") != env_vars.end()) { + new_path += ":" + env_vars["PATH"]; } + env_vars["PATH"] = new_path; + + // Update LD_LIBRARY_PATH to include the virtual environment's lib directory + std::string new_lib_path = venv_path + "/lib"; + if (env_vars.find("LD_LIBRARY_PATH") != env_vars.end()) { + new_lib_path += ":" + env_vars["LD_LIBRARY_PATH"]; + } + env_vars["LD_LIBRARY_PATH"] = new_lib_path; + + // Remove PYTHONHOME if it exists + env_vars.erase("PYTHONHOME"); + + return env_vars; } -bool -FileExists(std::string& path) +// Helper function to prepare environment array for execve +std::pair, std::vector> +PrepareEnvironment( + const std::map& env_vars, + const std::string& additional_lib_path) { - struct stat buffer; - return stat(path.c_str(), &buffer) == 0; + std::vector env_strings; + std::vector env_array; + + for (const auto& [key, value] : env_vars) { + std::string env_string; + if (key == "LD_LIBRARY_PATH" && !additional_lib_path.empty()) { + // Prepend the additional library path + env_string = key + "=" + additional_lib_path + ":" + value; + } else { + env_string = key + "=" + value; + } + env_strings.push_back(env_string); + } + + // Convert to char* array + for (auto& env_str : env_strings) { + env_array.push_back(const_cast(env_str.c_str())); + } + env_array.push_back(nullptr); + + return std::make_pair(std::move(env_strings), std::move(env_array)); } }}} // namespace triton::backend::python diff --git a/src/pb_utils.h b/src/pb_utils.h index 09a94642..fa315210 100644 --- a/src/pb_utils.h +++ b/src/pb_utils.h @@ -1,4 +1,4 @@ -// Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// Copyright 2021-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
// // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions @@ -26,13 +26,25 @@ #pragma once -#include +#ifdef TRITON_ENABLE_GPU +#include +#endif // TRITON_ENABLE_GPU + +#include +#include +#include +#include +#include #include -#include +#include #include +#include #include #include +#include #include + +#include "pb_exception.h" #include "shm_manager.h" #include "triton/backend/backend_common.h" #include "triton/core/tritonserver.h" @@ -47,12 +59,12 @@ namespace bi = boost::interprocess; (X); \ } \ catch (cont PythonBackendException & pb_exception) { \ - off_t string_offset__; \ + bi::managed_external_buffer::handle_t string_handle__; \ try { \ SaveStringToSharedMemory( \ - SHM_POOL, string_offset__, pb_exception.what()); \ + SHM_POOL, string_handle__, pb_exception.what()); \ RESPONSE->has_error = true; \ - RESPONSE->error = string_offset__; \ + RESPONSE->error = string_handle__; \ if (R) \ return; \ } \ @@ -63,134 +75,296 @@ namespace bi = boost::interprocess; TRITONSERVER_ErrorMessage(pb2_exception.what())); \ } \ } \ - while (false) + } while (false) -// -// Represents a raw data -// -struct RawData { - off_t memory_ptr; - TRITONSERVER_MemoryType memory_type; - int memory_type_id; - uint64_t byte_size; +#define THROW_IF_TRITON_ERROR(X) \ + do { \ + TRITONSERVER_Error* tie_err__ = (X); \ + if (tie_err__ != nullptr) { \ + auto error_message = std::string(TRITONSERVER_ErrorMessage(tie_err__)); \ + TRITONSERVER_ErrorDelete(tie_err__); \ + throw PythonBackendException(error_message); \ + } \ + } while (false) + +#define THROW_IF_CUDA_ERROR(X) \ + do { \ + cudaError_t cuda_err__ = (X); \ + if (cuda_err__ != cudaSuccess) { \ + throw PythonBackendException( \ + std::string(cudaGetErrorString(cuda_err__))); \ + } \ + } while (false) + +#define THROW_IF_ERROR(MSG, X) \ + do { \ + int return__ = (X); \ + if (return__ != 0) { \ + throw PythonBackendException(MSG); \ + } \ + } while (false) + + +#define DUMMY_MESSAGE 0 +#define DISALLOW_COPY(TypeName) TypeName(const TypeName&) = delete; +#define DISALLOW_ASSIGN(TypeName) void operator=(const TypeName&) = delete; +#define DISALLOW_COPY_AND_ASSIGN(TypeName) \ + DISALLOW_COPY(TypeName) \ + DISALLOW_ASSIGN(TypeName) + +struct InitializeResponseShm { + // Indicates whether the response has an error or not. + bool response_has_error; + // Indicates whether the response error is set or not. + bool response_is_error_set; + // Contains the error message. + bi::managed_external_buffer::handle_t response_error; }; -// -// Represents a Tensor object that will be passed to Python code. -// -struct Tensor { - off_t raw_data; // Offset for raw data field. - off_t name; // Offset for name field. - TRITONSERVER_DataType dtype; - off_t dims; // Shared memory offset for the dimensions. - size_t dims_count; +struct AutoCompleteResponseShm { + // Indicates whether the response has an error or not. + bool response_has_error; + // Indicates whether the response error is set or not. + bool response_is_error_set; + // Contains the error message. + bi::managed_external_buffer::handle_t response_error; + // Indicates whether the response has model config or not. + bool response_has_model_config; + // Contains the model config + bi::managed_external_buffer::handle_t response_model_config; }; -struct String { - off_t data; - size_t length; +// Control data structure for the communication between the Python stub and the +// main stub. 
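THROW_IF_TRITON_ERROR and THROW_IF_CUDA_ERROR above turn C-style error returns into PythonBackendException so that call sites stay short and the TRITONSERVER_Error* is always freed. A usage sketch, assuming it is compiled inside the backend where pb_utils.h and the Triton core headers are available:

// Any call returning TRITONSERVER_Error* can be wrapped; on failure the
// error message is copied, the error object is deleted, and a
// PythonBackendException is thrown.
#include <iostream>

#include "pb_utils.h"

void SetVerboseLoggingSketch(TRITONSERVER_ServerOptions* options) {
  try {
    THROW_IF_TRITON_ERROR(
        TRITONSERVER_ServerOptionsSetLogVerbose(options, 1));
  }
  catch (const triton::backend::python::PythonBackendException& ex) {
    std::cerr << "option rejected: " << ex.what() << std::endl;
  }
}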
+struct IPCControlShm { + bool stub_health; + bool parent_health; + bool uses_env; + bool decoupled; + bi::interprocess_mutex parent_health_mutex; + bi::interprocess_mutex stub_health_mutex; + bi::managed_external_buffer::handle_t stub_message_queue; + bi::managed_external_buffer::handle_t parent_message_queue; + bi::managed_external_buffer::handle_t stub_to_parent_mq; + bi::managed_external_buffer::handle_t parent_to_stub_mq; + bi::managed_external_buffer::handle_t memory_manager_message_queue; }; -// -// Inference Request -// -struct Request { - off_t id; // Offset for the id field. - uint64_t correlation_id; - off_t inputs; // Offset for input field. - uint32_t requested_input_count; - off_t requested_output_names; // Offset for the requested output names - uint32_t requested_output_count; -}; - -struct Response { - off_t outputs; // Offset for Tensor output. - uint32_t outputs_size; - off_t error; - bool has_error; - bool is_error_set; // Indicates whether this error has a message or not. +struct SendMessageBase { + bi::interprocess_mutex mu; + bi::interprocess_condition cv; + bool waiting_on_stub; }; -struct ResponseBatch { - off_t responses; // Offset for response object. +struct ResponseBatch : SendMessageBase { uint32_t batch_size; - off_t error; + bi::managed_external_buffer::handle_t error; + bool has_error; + + // Indicates whether an additional call to stub is required for the clean up + // of the resources. + bool cleanup; + + // Indicates whether this error has a message or not. + bool is_error_set; + + uint32_t response_size; + + // Indicates whether the response factory has been deleted or not. + bool is_response_factory_deleted = false; +}; + +enum LogLevel { kInfo = 0, kWarning, kError, kVerbose }; + +enum MetricKind { kCounter = 0, kGauge, kHistogram }; + +struct LogSendMessage : SendMessageBase { + bi::managed_external_buffer::handle_t filename; + int32_t line; + bi::managed_external_buffer::handle_t log_message; + LogLevel level; +}; + +struct CleanupMessage : SendMessageBase { + void* id; +}; + +struct CancelBLSRequestMessage : SendMessageBase { + void* infer_payload_id; + bool is_cancelled; +}; + +struct IsCancelledMessage : SendMessageBase { + intptr_t response_factory_address; + intptr_t request_address; + bool is_cancelled; +}; + +struct CustomMetricsMessage : SendMessageBase { + bi::managed_external_buffer::handle_t message; + bool has_error; + bool is_error_set; + bi::managed_external_buffer::handle_t error; + // This field is specifically utilized when making the + // 'PYTHONSTUB_MetricRequestValue' request. It is used to hold the metric + // value after the Python backend calls the Triton C API to retrieve the + // metric value and pass it back to the stub process. + double value; + // This field is specifically utilized when making the + // 'PYTHONSTUB_MetricFamilyRequestNew' or 'PYTHONSTUB_MetricRequestNew' + // requests. It is used to hold the memory address of + // TRITONSERVER_MetricFamily' or 'TRITONSERVER_Metric' objects created in the + // Python backend and pass back to the stub process. + void* address; +}; + +struct ModelLoaderMessage : SendMessageBase { + bi::managed_external_buffer::handle_t message; bool has_error; - bool is_error_set; // Indicates whether this error has a message or not. 
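The message structs above that derive from SendMessageBase share one handshake: a side flips waiting_on_stub under the mutex and notifies, while the other side waits on the condition until the flag changes. A single-process sketch of that protocol with Boost.Interprocess primitives (in the backend the struct lives in shared memory and the two sides are separate processes):

// Assumes Boost.Interprocess is available, as it is for this backend.
#include <boost/interprocess/sync/interprocess_condition.hpp>
#include <boost/interprocess/sync/interprocess_mutex.hpp>
#include <boost/interprocess/sync/scoped_lock.hpp>
#include <iostream>
#include <thread>

namespace bi = boost::interprocess;

struct SendMessageSketch {
  bi::interprocess_mutex mu;
  bi::interprocess_condition cv;
  bool waiting_on_stub = false;
};

int main() {
  SendMessageSketch msg;

  std::thread other_side([&msg]() {
    bi::scoped_lock<bi::interprocess_mutex> lock(msg.mu);
    msg.waiting_on_stub = true;  // flag flipped under the lock
    msg.cv.notify_one();
  });

  {
    bi::scoped_lock<bi::interprocess_mutex> lock(msg.mu);
    while (!msg.waiting_on_stub) {  // wait until the other side responds
      msg.cv.wait(lock);
    }
  }
  other_side.join();
  std::cout << "handshake complete" << std::endl;
  return 0;
}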
+ bool is_error_set; + bi::managed_external_buffer::handle_t error; + bool is_model_ready; +}; + +struct ResponseSenderBase { + bi::interprocess_mutex mu; + bi::interprocess_condition cv; + bool is_stub_turn; + bool has_error; + bool is_error_set; + bi::managed_external_buffer::handle_t error; + intptr_t request_address; + intptr_t response_factory_address; +}; + +struct ResponseSendMessage : ResponseSenderBase { + bi::managed_external_buffer::handle_t response; + + // A shm handle to a GPUBuffersShm object. + bi::managed_external_buffer::handle_t gpu_buffers_handle; + + uint32_t flags; }; struct RequestBatch { - off_t requests; // Offset for request object. uint32_t batch_size; -}; -struct IPCMessage { - // request points to a RequestBatch struct. - off_t request_batch; + // A shm handle to a GPUBuffersShm object. + bi::managed_external_buffer::handle_t gpu_buffers_handle; +}; - // response points to a ResponseBatch struct. - off_t response_batch; - bool health; +struct MemoryReleaseMessage { + std::mutex mu; + std::condition_variable cv; + uint64_t id; + bool waiting_on_stub; }; -// Representing a key value pair -struct Pair { - off_t key; - off_t value; +#ifdef TRITON_ENABLE_GPU +struct CUDAMemPoolMessage : SendMessageBase { + cudaIpcMemHandle_t cuda_handle; + int32_t device_id; + bi::managed_external_buffer::handle_t error; + bool has_error; + bool is_error_set; }; -struct Dict { - uint32_t length; - // Values point to the location where there are `length` pairs. - off_t values; +class CUDAHandler { + public: + static CUDAHandler& getInstance() + { + static CUDAHandler instance; + return instance; + } + + private: + std::mutex mu_; + void* dl_open_handle_ = nullptr; + std::string error_str_; + CUresult (*cu_pointer_get_attribute_fn_)( + CUdeviceptr*, CUpointer_attribute, CUdeviceptr) = nullptr; + CUresult (*cu_get_error_string_fn_)(CUresult, const char**) = nullptr; + CUresult (*cu_init_fn_)(unsigned int) = nullptr; + CUresult (*cu_device_primary_ctx_get_state_fn_)( + CUdevice, unsigned int*, int*) = nullptr; + CUDAHandler(); + + /// Check if a primary context has already been created for a device. + bool HasPrimaryContext(int device); + ~CUDAHandler() noexcept(false); + + public: + CUDAHandler(CUDAHandler const&) = delete; + void operator=(CUDAHandler const&) = delete; + bool IsAvailable(); + const std::string& GetErrorString() const { return error_str_; } + void ClearErrorString() { return error_str_.clear(); } + void PointerGetAttribute( + CUdeviceptr* start_address, CUpointer_attribute attr, + CUdeviceptr device_ptr); + void OpenCudaHandle( + int64_t memory_type_id, cudaIpcMemHandle_t* cuda_mem_handle, + void** data_ptr); + void CloseCudaHandle(int64_t memory_type_id, void* data_ptr); + void* LoadSharedObject(const char* filename); + void* LocateSymbol(const char* symbol); + std::string LocateSymbolError(); + void CloseLibrary(); + + /// Set the device only if the primary context has already been created for + /// this device. Inspired from PyTorch's MaybeSetDevice. + /// \param device The cuda device index. + void MaybeSetDevice(int device); }; -// -// PythonBackendException -// -// Exception thrown if error occurs in PythonBackend. -// -struct PythonBackendException : std::exception { - PythonBackendException(std::string message) : message_(message) {} - const char* what() const throw() { return message_.c_str(); } +/// A helper class to change the current device and restore the old context. 
The +/// old context will be restored only if the primary context for that device is +/// already created, otherwise the CUDA context will remain as the primary +/// context of 'device'. +class ScopedSetDevice { + public: + ScopedSetDevice(int device); + ~ScopedSetDevice(); - std::string message_; + private: + int device_; + int current_device_; }; -void SaveMapToSharedMemory( - std::unique_ptr& shm_pool, off_t& shm_offset, - const std::unordered_map& map); +// Check if the data is allocated from the pool by the base address. +bool IsUsingCUDAPool( + std::unique_ptr& cuda_pool, int64_t memory_type_id, + void* data); + +#endif // TRITON_ENABLE_GPU + +// FIXME: [DLIS-6078]: We should not need this function. However, some paths are +// being retrieved from core that are not platform-agnostic. +void SanitizePath(std::string& path); + +// Invalid characters that are not allowed in user input +constexpr const char* INVALID_CHARS = ";|&$`<>()[]{}\\\"'*?~#!"; -void LoadMapFromSharedMemory( - std::unique_ptr& shm_pool, off_t shm_offset, - std::unordered_map& map); +// Validate that an identifier (model name, region name, etc.) +bool IsValidIdentifier(const std::string& input); -void SaveStringToSharedMemory( - std::unique_ptr& shm_pool, off_t& shm_offset, - const char* str); -void LoadStringFromSharedMemory( - std::unique_ptr& shm_pool, off_t shm_offset, char*& str); +// Check if a file exists and is executable +bool IsExecutableFile(const std::string& filepath); -void LoadRawDataFromSharedLibrary( - std::unique_ptr& shm_pool, off_t& tensor_shm_offset, - const Tensor& tensor); -void SaveRawDataToSharedMemory( - std::unique_ptr& shm_pool, off_t& raw_data_offset, - char*& raw_data_ptr, TRITONSERVER_MemoryType memory_type, - int memory_type_id, uint64_t byte_size); +#ifndef TRITON_PB_STUB +std::shared_ptr WrapTritonErrorInSharedPtr( + TRITONSERVER_Error* error); +#endif -void SaveTensorToSharedMemory( - std::unique_ptr& shm_pool, Tensor* tensor, - char*& raw_data_ptr, TRITONSERVER_MemoryType memory_type, - int memory_type_id, uint64_t byte_size, const char* name, - const int64_t* dims, size_t dims_count, TRITONSERVER_DataType dtype); -void LoadTensorFromSharedMemory( - std::unique_ptr& shm_pool, off_t tensor_shm_offset, - Tensor& tensor); +std::string GenerateUUID(); -void ExtractTarFile(std::string& archive_path, std::string& dst_path); +// Environment handling utilities for Python activation scripts +std::map ParseActivationScript( + const std::string& activate_path); -bool FileExists(std::string& path); +std::pair, std::vector> PrepareEnvironment( + const std::map& env_vars, + const std::string& additional_lib_path = ""); }}} // namespace triton::backend::python diff --git a/src/python.cc b/src/python.cc deleted file mode 100644 index 0284c422..00000000 --- a/src/python.cc +++ /dev/null @@ -1,1560 +0,0 @@ -// Copyright (c) 2020-2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions -// are met: -// * Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// * Redistributions in binary form must reproduce the above copyright -// notice, this list of conditions and the following disclaimer in the -// documentation and/or other materials provided with the distribution. 
-// * Neither the name of NVIDIA CORPORATION nor the names of its -// contributors may be used to endorse or promote products derived -// from this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY -// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR -// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY -// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include "pb_env.h" -#include "pb_utils.h" -#include "shm_manager.h" -#include "triton/backend/backend_common.h" -#include "triton/backend/backend_input_collector.h" -#include "triton/backend/backend_memory.h" -#include "triton/backend/backend_model.h" -#include "triton/backend/backend_model_instance.h" -#include "triton/common/triton_json.h" -#include "triton/core/tritonbackend.h" -#include "triton/core/tritonserver.h" - - -#define RESPOND_ALL_AND_RETURN_IF_ERROR(RESPONSES, RESPONSES_COUNT, X) \ - do { \ - TRITONSERVER_Error* raarie_err__ = (X); \ - if (raarie_err__ != nullptr) { \ - SendErrorForResponses(RESPONSES, RESPONSES_COUNT, raarie_err__); \ - return nullptr; \ - } \ - } while (false) - -#define RESPOND_ALL_AND_RETURN_IF_EXCEPTION(RESPONSES, RESPONSES_COUNT, X) \ - do { \ - try { \ - (X); \ - } \ - catch (const PythonBackendException& exception) { \ - TRITONSERVER_Error* raarie_err__ = TRITONSERVER_ErrorNew( \ - TRITONSERVER_ERROR_INTERNAL, exception.what()); \ - SendErrorForResponses(RESPONSES, RESPONSES_COUNT, raarie_err__); \ - return nullptr; \ - } \ - } while (false) - -#define RESPOND_AND_RETURN_IF_ERROR(REQUEST, X) \ - do { \ - TRITONSERVER_Error* rarie_err__ = (X); \ - if (rarie_err__ != nullptr) { \ - TRITONBACKEND_Response* rarie_response__ = nullptr; \ - LOG_IF_ERROR( \ - TRITONBACKEND_ResponseNew(&rarie_response__, REQUEST), \ - "failed to create response"); \ - if (rarie_response__ != nullptr) { \ - LOG_IF_ERROR( \ - TRITONBACKEND_ResponseSend( \ - rarie_response__, TRITONSERVER_RESPONSE_COMPLETE_FINAL, \ - rarie_err__), \ - "failed to send error response"); \ - } \ - return rarie_err__; \ - } \ - } while (false) - -#define RESPOND_AND_RETURN_IF_EXCEPTION(REQUEST, X) \ - do { \ - try { \ - (X); \ - } \ - catch (const PythonBackendException& exception) { \ - TRITONSERVER_Error* rarie_err__ = TRITONSERVER_ErrorNew( \ - TRITONSERVER_ERROR_INTERNAL, exception.what()); \ - TRITONBACKEND_Response* rarie_response__ = nullptr; \ - LOG_IF_ERROR( \ - TRITONBACKEND_ResponseNew(&rarie_response__, REQUEST), \ - "failed to create response"); \ - if (rarie_response__ != nullptr) { \ - LOG_IF_ERROR( \ - TRITONBACKEND_ResponseSend( \ - rarie_response__, TRITONSERVER_RESPONSE_COMPLETE_FINAL, \ - rarie_err__), \ - "failed to send error response"); \ - } \ - return rarie_err__; \ - } \ - } 
while (false) - -#define GUARDED_RESPOND_IF_ERROR(RESPONSES, IDX, X) \ - do { \ - if ((RESPONSES)[IDX] != nullptr) { \ - TRITONSERVER_Error* err__ = (X); \ - if (err__ != nullptr) { \ - LOG_IF_ERROR( \ - TRITONBACKEND_ResponseSend( \ - (RESPONSES)[IDX], TRITONSERVER_RESPONSE_COMPLETE_FINAL, \ - err__), \ - "failed to send error response"); \ - (RESPONSES)[IDX] = nullptr; \ - TRITONSERVER_ErrorDelete(err__); \ - } \ - } \ - } while (false) - -#define GUARDED_RESPOND_IF_EXCEPTION(RESPONSES, IDX, X) \ - do { \ - if ((RESPONSES)[IDX] != nullptr) { \ - try { \ - (X); \ - } \ - catch (const PythonBackendException& pb_exception) { \ - TRITONSERVER_Error* err__ = TRITONSERVER_ErrorNew( \ - TRITONSERVER_ERROR_INTERNAL, pb_exception.what()); \ - LOG_IF_ERROR( \ - TRITONBACKEND_ResponseSend( \ - (RESPONSES)[IDX], TRITONSERVER_RESPONSE_COMPLETE_FINAL, \ - err__), \ - "failed to send error response"); \ - (RESPONSES)[IDX] = nullptr; \ - TRITONSERVER_ErrorDelete(err__); \ - } \ - } \ - } while (false) - -#define RETURN_IF_EXCEPTION(X) \ - do { \ - try { \ - (X); \ - } \ - catch (const PythonBackendException& pb_exception) { \ - TRITONSERVER_Error* rarie_err__ = TRITONSERVER_ErrorNew( \ - TRITONSERVER_ERROR_INTERNAL, pb_exception.what()); \ - return rarie_err__; \ - } \ - } while (false) - -namespace triton { namespace backend { namespace python { - -namespace bi = boost::interprocess; - -struct BackendState { - std::string python_lib; - int64_t shm_default_byte_size; - int64_t shm_growth_byte_size; - int64_t stub_timeout_seconds; - std::unique_ptr env_manager; -}; - -class ModelState : public BackendModel { - public: - static TRITONSERVER_Error* Create( - TRITONBACKEND_Model* triton_model, ModelState** state); - - // Get backend state - BackendState* StateForBackend() { return backend_state_; } - - // Get the Python execution environment - std::string PythonExecutionEnv() { return python_execution_env_; } - - private: - ModelState(TRITONBACKEND_Model* triton_model); - BackendState* backend_state_; - std::string python_execution_env_; -}; - -TRITONSERVER_Error* -CreateTritonErrorFromException(const PythonBackendException& pb_exception) -{ - return TRITONSERVER_ErrorNew( - TRITONSERVER_ERROR_INTERNAL, pb_exception.what()); -} - -class ModelInstanceState : public BackendModelInstance { - ModelInstanceState( - ModelState* model_state, TRITONBACKEND_ModelInstance* model_instance); - - TRITONBACKEND_Model* triton_model_; - bi::interprocess_mutex* stub_mutex_; - bi::interprocess_condition* stub_cond_; - bi::interprocess_mutex* parent_mutex_; - bi::interprocess_condition* parent_cond_; - bi::interprocess_mutex* health_mutex_; - std::unique_ptr> parent_lock_; - std::string model_path_; - IPCMessage* ipc_message_; - std::unique_ptr shm_pool_; - - // Stub process pid - pid_t stub_pid_; - - // Parent process pid - pid_t parent_pid_; - bool initialized_; - - // Path to python execution environment - std::string path_to_libpython_; - std::string path_to_activate_; - - public: - static TRITONSERVER_Error* Create( - ModelState* model_state, TRITONBACKEND_ModelInstance* model_instance, - ModelInstanceState** model_instance_state); - - ~ModelInstanceState(); - - // Load Triton inputs to the appropriate Protobufs - TRITONSERVER_Error* GetInputTensor( - const uint32_t input_idx, Tensor* input_tensor, - TRITONBACKEND_Request* request, - std::vector& responses); - - TRITONSERVER_Error* ProcessRequests( - TRITONBACKEND_Request** requests, const uint32_t request_count); - - // Create the stub process. 
- TRITONSERVER_Error* SetupStubProcess(); - - // Notifies the stub process on the new request. Returns false if the parent - // process fails to acquire the lock. - bool NotifyStub(); - - // Checks whether the stub process is live - bool IsStubProcessAlive(); - - // Wait for stub notification - bool WaitForStubNotification(); - - // Responds to all the requests with an error message. - void RespondErrorToAllRequests( - const char* message, std::vector& responses, - TRITONBACKEND_Request** requests, const uint32_t request_count); - - // Kill stub process - void KillStubProcess(); - - // Start stub process - TRITONSERVER_Error* StartStubProcess(); -}; - -ModelInstanceState::ModelInstanceState( - ModelState* model_state, TRITONBACKEND_ModelInstance* triton_model_instance) - : BackendModelInstance(model_state, triton_model_instance), stub_pid_(0), - initialized_(false) -{ -} - -TRITONSERVER_Error* -ModelInstanceState::Create( - ModelState* model_state, TRITONBACKEND_ModelInstance* triton_model_instance, - ModelInstanceState** state) -{ - try { - *state = new ModelInstanceState(model_state, triton_model_instance); - } - catch (const BackendModelInstanceException& ex) { - RETURN_ERROR_IF_TRUE( - ex.err_ == nullptr, TRITONSERVER_ERROR_INTERNAL, - std::string("unexpected nullptr in BackendModelInstanceException")); - RETURN_IF_ERROR(ex.err_); - } - return nullptr; // success -} - -bool -ModelInstanceState::NotifyStub() -{ - boost::posix_time::ptime timeout = - boost::get_system_time() + boost::posix_time::milliseconds(1000); - bi::scoped_lock lock(*stub_mutex_, timeout); - - if (lock) { - stub_cond_->notify_one(); - return true; - } else { - return false; - } -} - -void -ModelInstanceState::KillStubProcess() -{ - kill(stub_pid_, SIGKILL); - int status; - waitpid(stub_pid_, &status, 0); - stub_pid_ = 0; -} - -bool -ModelInstanceState::WaitForStubNotification() -{ - uint64_t timeout_seceonds = 1000; - boost::posix_time::ptime timeout = - boost::get_system_time() + - boost::posix_time::milliseconds(timeout_seceonds); - - { - bi::scoped_lock lock(*health_mutex_, timeout); - - // Check if lock has been acquired. - if (lock) { - ipc_message_->health = false; - } else { - // If It failed to obtain the lock, it means that the stub has been - // stuck or exited while holding the health mutex lock. 
- return false; - } - } - - timeout = boost::get_system_time() + - boost::posix_time::milliseconds(timeout_seceonds); - while (!parent_cond_->timed_wait(*parent_lock_, timeout)) { - if (!IsStubProcessAlive()) { - return false; - } - - timeout = boost::get_system_time() + - boost::posix_time::milliseconds(timeout_seceonds); - } - return true; -} - -void -ModelInstanceState::RespondErrorToAllRequests( - const char* message, std::vector& responses, - TRITONBACKEND_Request** requests, const uint32_t request_count) -{ - for (uint32_t r = 0; r < request_count; ++r) { - if (responses[r] == nullptr) - continue; - - TRITONSERVER_Error* err = TRITONSERVER_ErrorNew( - TRITONSERVER_ERROR_INTERNAL, - (std::string("Failed to process the request(s), message: ") + message) - .c_str()); - LOG_MESSAGE( - TRITONSERVER_LOG_INFO, "Failed to process the batch of requests."); - LOG_IF_ERROR( - TRITONBACKEND_ResponseSend( - responses[r], TRITONSERVER_RESPONSE_COMPLETE_FINAL, err), - "failed sending response"); - - responses[r] = nullptr; - TRITONSERVER_ErrorDelete(err); - } -} - -TRITONSERVER_Error* -ModelInstanceState::ProcessRequests( - TRITONBACKEND_Request** requests, const uint32_t request_count) -{ - ModelState* model_state = reinterpret_cast(Model()); - int max_batch_size = model_state->MaxBatchSize(); - std::string name = model_state->Name(); - - // For each request collect the total batch size for this inference - // execution. The batch-size, number of inputs, and size of each - // input has already been checked so don't need to do that here. - - size_t total_batch_size = 0; - for (size_t i = 0; i < request_count; i++) { - // If we get a nullptr request then something is badly wrong. Fail - // and release all requests. - if (requests[i] == nullptr) { - RequestsRespondWithError( - requests, request_count, - TRITONSERVER_ErrorNew( - TRITONSERVER_ERROR_INTERNAL, - std::string( - "null request given to Python backend for '" + name + "'") - .c_str())); - return nullptr; - } - - if (max_batch_size > 0) { - // Retrieve the batch size from one of the inputs, if the model - // supports batching, the first dimension size is batch size - TRITONBACKEND_Input* input; - TRITONSERVER_Error* err = - TRITONBACKEND_RequestInputByIndex(requests[i], 0 /* index */, &input); - if (err == nullptr) { - const int64_t* shape; - err = TRITONBACKEND_InputProperties( - input, nullptr, nullptr, &shape, nullptr, nullptr, nullptr); - total_batch_size += shape[0]; - } - if (err != nullptr) { - RequestsRespondWithError(requests, request_count, err); - return nullptr; - } - } else { - total_batch_size += 1; - } - } - - // If there are no valid payloads then no need to run the inference. - if (total_batch_size == 0) { - return nullptr; - } - - // Make sure the maximum batch size is not exceeded. The - // total_batch_size must be 1 for models that don't support batching - // (i.e. max_batch_size == 0). If max_batch_size is exceeded then - // scheduler has done something badly wrong so fail and release all - // requests. 
- if ((total_batch_size != 1) && (total_batch_size > (size_t)max_batch_size)) { - RequestsRespondWithError( - requests, request_count, - TRITONSERVER_ErrorNew( - TRITONSERVER_ERROR_INTERNAL, - std::string( - "batch size " + std::to_string(total_batch_size) + " for '" + - name + "', max allowed is " + std::to_string(max_batch_size)) - .c_str())); - return nullptr; - } - - LOG_MESSAGE( - TRITONSERVER_LOG_VERBOSE, - (std::string("model ") + model_state->Name() + ", instance " + Name() + - ", executing " + std::to_string(request_count) + " requests") - .c_str()); - uint64_t exec_start_ns = 0; - SET_TIMESTAMP(exec_start_ns); - - // Create Python inference requests - RequestBatch* request_batch; - off_t request_batch_offset; - RETURN_IF_EXCEPTION(shm_pool_->Map( - (char**)&request_batch, sizeof(RequestBatch), request_batch_offset)); - - ipc_message_->request_batch = request_batch_offset; - request_batch->batch_size = request_count; - - Request* requests_shm; - off_t requests_shm_offset; - RETURN_IF_EXCEPTION(shm_pool_->Map( - (char**)&requests_shm, sizeof(Request) * request_count, - requests_shm_offset)); - request_batch->requests = requests_shm_offset; - - // We take the responsibilty of the responses. - std::vector responses; - responses.reserve(request_count); - - for (size_t i = 0; i < request_count; i++) { - TRITONBACKEND_Response* response; - auto err = TRITONBACKEND_ResponseNew(&response, requests[i]); - if (err == nullptr) { - responses.emplace_back(response); - } else { - responses.emplace_back(nullptr); - LOG_MESSAGE(TRITONSERVER_LOG_ERROR, "Fail to create response."); - TRITONSERVER_ErrorDelete(err); - } - } - - for (uint32_t r = 0; r < request_count; ++r) { - TRITONBACKEND_Request* request = requests[r]; - Request* python_infer_request = &requests_shm[r]; - uint32_t requested_input_count = 0; - RESPOND_ALL_AND_RETURN_IF_ERROR( - &responses, request_count, - TRITONBACKEND_RequestInputCount(request, &requested_input_count)); - - python_infer_request->requested_input_count = requested_input_count; - - uint32_t requested_output_count = 0; - RESPOND_ALL_AND_RETURN_IF_ERROR( - &responses, request_count, - TRITONBACKEND_RequestOutputCount(request, &requested_output_count)); - python_infer_request->requested_output_count = requested_output_count; - - Tensor* input_tensors; - off_t input_tensors_offset; - - RESPOND_ALL_AND_RETURN_IF_EXCEPTION( - &responses, request_count, - shm_pool_->Map( - (char**)&input_tensors, sizeof(Tensor) * requested_input_count, - input_tensors_offset)); - python_infer_request->inputs = input_tensors_offset; - - for (size_t iidx = 0; iidx < requested_input_count; ++iidx) { - Tensor* input_tensor = &input_tensors[iidx]; - - RESPOND_ALL_AND_RETURN_IF_ERROR( - &responses, request_count, - GetInputTensor(iidx, input_tensor, request, responses)); - } - - off_t* requested_output_names; - off_t requested_output_names_offset; - - RESPOND_ALL_AND_RETURN_IF_EXCEPTION( - &responses, request_count, - shm_pool_->Map( - (char**)&requested_output_names, - sizeof(off_t) * requested_output_count, - requested_output_names_offset)); - python_infer_request->requested_output_names = - requested_output_names_offset; - - // Append the list of requested outputs to the inference_request - for (size_t iidx = 0; iidx < requested_output_count; ++iidx) { - const char* requested_output_name; - RESPOND_ALL_AND_RETURN_IF_ERROR( - &responses, request_count, - TRITONBACKEND_RequestOutputName( - request, iidx, &requested_output_name)); - - // output name - off_t output_name_offset; - 
RESPOND_ALL_AND_RETURN_IF_EXCEPTION( - &responses, request_count, - SaveStringToSharedMemory( - shm_pool_, output_name_offset, requested_output_name)); - requested_output_names[iidx] = output_name_offset; - } - - // request id - const char* id; - RESPOND_ALL_AND_RETURN_IF_ERROR( - &responses, request_count, TRITONBACKEND_RequestId(request, &id)); - - off_t id_offset; - RESPOND_ALL_AND_RETURN_IF_EXCEPTION( - &responses, request_count, - SaveStringToSharedMemory(shm_pool_, id_offset, id)); - python_infer_request->id = id_offset; - - uint64_t correlation_id; - RESPOND_ALL_AND_RETURN_IF_ERROR( - &responses, request_count, - TRITONBACKEND_RequestCorrelationId(request, &correlation_id)); - python_infer_request->correlation_id = correlation_id; - } - - uint64_t compute_start_ns = 0; - SET_TIMESTAMP(compute_start_ns); - - // This means that the stub process has exited and Python - // backend failed to restart the stub process. - if (stub_pid_ == 0) { - const char* error_message = "The stub process has exited unexpectedly."; - RespondErrorToAllRequests( - error_message, responses, requests, request_count); - - // Update the shared memory offset so that we can reuse the shared memory - shm_pool_->SetOffset(request_batch_offset); - return nullptr; - } - - // If parent fails to notify the stub or the stub fails to notify the - // parent in a timely manner, kill the stub process and restart the - // stub process. - if (!NotifyStub() || !WaitForStubNotification()) { - KillStubProcess(); - const char* error_message = "The stub process has exited unexpectedly."; - LOG_MESSAGE(TRITONSERVER_LOG_ERROR, error_message); - TRITONSERVER_Error* err = StartStubProcess(); - if (err == nullptr) { - LOG_MESSAGE( - TRITONSERVER_LOG_INFO, "Stub process successfully restarted."); - } else { - LOG_MESSAGE( - TRITONSERVER_LOG_ERROR, - (std::string( - "Stub process failed to restart. Your future requests to " - "model ") + - name_ + " will fail. Error: " + TRITONSERVER_ErrorMessage(err)) - .c_str()); - } - RespondErrorToAllRequests( - error_message, responses, requests, request_count); - - // Update the shared memory offset so that we can reuse the shared memory - shm_pool_->SetOffset(request_batch_offset); - return nullptr; - } - - uint64_t compute_end_ns = 0; - SET_TIMESTAMP(compute_end_ns); - - // Parsing the request response - ResponseBatch* response_batch; - RESPOND_ALL_AND_RETURN_IF_EXCEPTION( - &responses, request_count, - shm_pool_->MapOffset( - (char**)&response_batch, sizeof(ResponseBatch), - ipc_message_->response_batch)); - - // If inference fails, release all the requests and send an error response. 
If - // inference fails at this stage, it usually indicates a bug in the model code - if (response_batch->has_error) { - if (response_batch->is_error_set) { - char* error_message; - RESPOND_ALL_AND_RETURN_IF_EXCEPTION( - &responses, request_count, - LoadStringFromSharedMemory( - shm_pool_, response_batch->error, error_message)); - RespondErrorToAllRequests( - error_message, responses, requests, request_count); - } else { - const char* error_message = - "Failed to fetch the error in response batch."; - RespondErrorToAllRequests( - error_message, responses, requests, request_count); - } - - return nullptr; - } - - Response* responses_shm; - RESPOND_ALL_AND_RETURN_IF_EXCEPTION( - &responses, request_count, - shm_pool_->MapOffset( - (char**)&responses_shm, sizeof(Response) * response_batch->batch_size, - response_batch->responses)); - - - for (uint32_t r = 0; r < request_count; ++r) { - TRITONBACKEND_Response* response = responses[r]; - TRITONBACKEND_Request* request = requests[r]; - uint32_t requested_output_count = 0; - - // Get response r - Response* response_shm = &responses_shm[r]; - - if (response_shm->has_error) { - try { - if (response_shm->is_error_set) { - char* err_string; - LoadStringFromSharedMemory( - shm_pool_, response_shm->error, err_string); - TRITONSERVER_Error* err = - TRITONSERVER_ErrorNew(TRITONSERVER_ERROR_INTERNAL, err_string); - - LOG_IF_ERROR( - TRITONBACKEND_ResponseSend( - responses[r], TRITONSERVER_RESPONSE_COMPLETE_FINAL, err), - "failed sending response"); - TRITONSERVER_ErrorDelete(err); - } else { - const char* err_string = "Failed to process response."; - TRITONSERVER_Error* err = - TRITONSERVER_ErrorNew(TRITONSERVER_ERROR_INTERNAL, err_string); - - LOG_IF_ERROR( - TRITONBACKEND_ResponseSend( - responses[r], TRITONSERVER_RESPONSE_COMPLETE_FINAL, err), - "failed sending response"); - TRITONSERVER_ErrorDelete(err); - } - } - catch (const PythonBackendException& pb_exception) { - TRITONSERVER_Error* err = CreateTritonErrorFromException(pb_exception); - - LOG_IF_ERROR( - TRITONBACKEND_ResponseSend( - responses[r], TRITONSERVER_RESPONSE_COMPLETE_FINAL, err), - "failed sending response"); - } - - responses[r] = nullptr; - - // If has_error is true, we do not look at the response even if the - // response is set. 
- continue; - } - - GUARDED_RESPOND_IF_ERROR( - responses, r, - TRITONBACKEND_RequestOutputCount(request, &requested_output_count)); - - Tensor* output_tensors; - GUARDED_RESPOND_IF_EXCEPTION( - responses, r, - shm_pool_->MapOffset( - (char**)&output_tensors, sizeof(Tensor) * requested_output_count, - response_shm->outputs)); - - bool cuda_copy = false; - std::set requested_output_names; - for (size_t j = 0; j < requested_output_count; ++j) { - const char* output_name; - GUARDED_RESPOND_IF_ERROR( - responses, r, - TRITONBACKEND_RequestOutputName(request, j, &output_name)); - requested_output_names.insert(output_name); - } - - for (size_t j = 0; j < requested_output_count; ++j) { - Tensor* output_tensor = &output_tensors[j]; - TRITONSERVER_DataType triton_dt = output_tensor->dtype; - size_t dims_count = output_tensor->dims_count; - int64_t* dims; - GUARDED_RESPOND_IF_EXCEPTION( - responses, r, - shm_pool_->MapOffset( - (char**)&dims, sizeof(int64_t) * dims_count, - output_tensor->dims)); - - char* name; - GUARDED_RESPOND_IF_EXCEPTION( - responses, r, - LoadStringFromSharedMemory(shm_pool_, output_tensor->name, name)); - - // Skip the output tensor if it is not in the list of requested outputs - if (requested_output_names.find(std::string(name)) == - requested_output_names.end()) { - continue; - } - - RawData* raw_data; - GUARDED_RESPOND_IF_EXCEPTION( - responses, r, - shm_pool_->MapOffset( - (char**)&raw_data, sizeof(RawData), output_tensor->raw_data)); - - char* data; - GUARDED_RESPOND_IF_EXCEPTION( - responses, r, - shm_pool_->MapOffset( - (char**)&data, raw_data->byte_size, raw_data->memory_ptr)); - - std::vector batch_shape(dims, dims + dims_count); - TRITONSERVER_MemoryType actual_memory_type = TRITONSERVER_MEMORY_CPU; - int64_t actual_memory_type_id = 0; - void* buffer; - - TRITONBACKEND_Output* response_output; - GUARDED_RESPOND_IF_ERROR( - responses, r, - TRITONBACKEND_ResponseOutput( - response, &response_output, name, triton_dt, batch_shape.data(), - batch_shape.size())); - - bool cuda_used; - GUARDED_RESPOND_IF_ERROR( - responses, r, - TRITONBACKEND_OutputBuffer( - response_output, &buffer, raw_data->byte_size, - &actual_memory_type, &actual_memory_type_id)); - CopyBuffer( - "Failed to copy string", TRITONSERVER_MEMORY_CPU /* memory_type */, - 0 /* memory_type_id */, actual_memory_type, actual_memory_type_id, - raw_data->byte_size, data, buffer, CudaStream(), &cuda_used); - cuda_copy |= cuda_used; - } -#ifdef TRITON_ENABLE_GPU - if (cuda_copy) { - cudaStreamSynchronize(stream_); - } -#endif // TRITON_ENABLE_GPU - - // If error happens at this stage, we can only log it - LOG_IF_ERROR( - TRITONBACKEND_ResponseSend( - responses[r], TRITONSERVER_RESPONSE_COMPLETE_FINAL, nullptr), - "failed sending response"); - } - - uint64_t exec_end_ns = 0; - SET_TIMESTAMP(exec_end_ns); - - for (uint32_t r = 0; r < request_count; ++r) { - TRITONBACKEND_Request* request = requests[r]; - - // Report statistics for the request. Note that there could - // still be responses that have not yet been sent but those - // cannot be captured in the statistics as they reflect only the - // request object. We use the execution start/end time for - // compute also so that the entire execution time is associated - // with the inference computation. 
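The per-response loop above only copies tensors whose names appear in the request's requested-output set; a small stand-alone sketch of that filter (the tensor names are made up for illustration):

#include <iostream>
#include <set>
#include <string>
#include <vector>

int main()
{
  // Names the client asked for, and names the model actually produced.
  std::set<std::string> requested_output_names = {"logits"};
  std::vector<std::string> produced = {"logits", "hidden_state"};

  for (const std::string& name : produced) {
    if (requested_output_names.find(name) == requested_output_names.end()) {
      continue;  // skip tensors that were not requested
    }
    std::cout << "copy output tensor: " << name << "\n";
  }
}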
- LOG_IF_ERROR( - TRITONBACKEND_ModelInstanceReportStatistics( - TritonModelInstance(), request, - (responses[r] != nullptr) /* success */, exec_start_ns, - compute_start_ns, compute_end_ns, exec_end_ns), - "failed reporting request statistics"); - } - - // Report the entire batch statistics. This backend does not support - // batching so the total batch size is always 1. - LOG_IF_ERROR( - TRITONBACKEND_ModelInstanceReportBatchStatistics( - TritonModelInstance(), total_batch_size, exec_start_ns, - compute_start_ns, compute_end_ns, exec_end_ns), - "failed reporting batch request statistics"); - - LOG_MESSAGE( - TRITONSERVER_LOG_VERBOSE, - (std::string("TRITONBACKEND_ModelInstanceExecute: model instance name ") + - Name() + " released " + std::to_string(request_count) + " requests") - .c_str()); - - // Update the shared memory offset so that we can reuse the shared memory - shm_pool_->SetOffset(request_batch_offset); - return nullptr; -} - -bool -ModelInstanceState::IsStubProcessAlive() -{ - boost::posix_time::ptime timeout = - boost::get_system_time() + boost::posix_time::seconds(1); - bi::scoped_lock lock(*health_mutex_, timeout); - - // Check if lock has been acquired. - if (lock) { - return ipc_message_->health; - } else { - // If It failed to obtain the lock, it means that the stub has been - // stuck or exited while holding the health mutex lock. - return false; - } -} - -TRITONSERVER_Error* -ModelInstanceState::StartStubProcess() -{ - stub_mutex_ = new (stub_mutex_) bi::interprocess_mutex; - health_mutex_ = new (health_mutex_) bi::interprocess_mutex; - stub_cond_ = new (stub_cond_) bi::interprocess_condition; - - std::string kind = TRITONSERVER_InstanceGroupKindString(kind_); - std::string shm_region_name = - std::string("/") + Name() + "_" + kind + "_" + std::to_string(device_id_); - - ModelState* model_state = reinterpret_cast(Model()); - int64_t shm_growth_size = - model_state->StateForBackend()->shm_growth_byte_size; - int64_t shm_default_size = - model_state->StateForBackend()->shm_default_byte_size; - const char* model_path = model_state->RepositoryPath().c_str(); - - initialized_ = false; - - pid_t pid = fork(); - if (pid < 0) { - return TRITONSERVER_ErrorNew( - TRITONSERVER_ERROR_INTERNAL, "Failed to fork the stub process."); - } - - // Stub process - if (pid == 0) { - const char* stub_args[4]; - stub_args[0] = "bash"; - stub_args[1] = "-c"; - stub_args[3] = nullptr; // Last argument must be nullptr - - // Default Python backend stub - std::string python_backend_stub = - model_state->StateForBackend()->python_lib + - "/triton_python_backend_stub"; - - // Path to alternative Python backend stub - std::string model_python_backend_stub = - std::string(model_path) + "/triton_python_backend_stub"; - - if (FileExists(model_python_backend_stub)) { - python_backend_stub = model_python_backend_stub; - } - - std::stringstream ss; - ss << "exec " << python_backend_stub << " " << model_path_ << " " - << shm_region_name << " " << shm_default_size << " " << shm_growth_size - << " " << parent_pid_ << " " - << model_state->StateForBackend()->python_lib; - - std::string bash_argument; - bash_argument = ss.str(); - if (model_state->PythonExecutionEnv() != "") { - // Need to properly set the LD_LIBRARY_PATH so that Python environments - // using different python versions load properly. 
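IsStubProcessAlive above takes the interprocess health mutex with a one-second timeout and treats a failed acquisition as a dead or hung stub. A sketch of the same idea using std::timed_mutex in place of the boost::interprocess primitives (SharedHealth and IsPeerAlive are illustrative names):

#include <chrono>
#include <mutex>

struct SharedHealth {
  std::timed_mutex mu;
  bool healthy = true;
};

bool IsPeerAlive(SharedHealth& h)
{
  // Try to take the health mutex for at most one second.
  std::unique_lock<std::timed_mutex> lock(h.mu, std::chrono::seconds(1));
  if (!lock.owns_lock()) {
    // The peer is stuck or exited while holding the mutex.
    return false;
  }
  return h.healthy;
}

int main()
{
  SharedHealth h;
  return IsPeerAlive(h) ? 0 : 1;
}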
- bash_argument = "export LD_LIBRARY_PATH=" + path_to_libpython_ + - ":$LD_LIBRARY_PATH; source " + path_to_activate_ + - " && " + bash_argument; - } - LOG_MESSAGE( - TRITONSERVER_LOG_VERBOSE, - (std::string("Starting Python backend stub: ") + bash_argument) - .c_str()); - - stub_args[2] = bash_argument.c_str(); - if (execvp("bash", (char**)stub_args) == -1) { - std::stringstream ss; - ss << "Failed to run python backend stub. Errno = " << errno << '\n' - << "Python backend stub path: " << python_backend_stub << '\n' - << "Shared Memory Region Name: " << shm_region_name << '\n' - << "Shared Memory Default Byte Size: " << shm_default_size << '\n' - << "Shared Memory Growth Byte Size: " << shm_growth_size << '\n'; - std::string log_message = ss.str(); - LOG_MESSAGE(TRITONSERVER_LOG_ERROR, log_message.c_str()); - - return TRITONSERVER_ErrorNew( - TRITONSERVER_ERROR_INTERNAL, - (std::string("Failed to initialize model instance ") + Name()) - .c_str()); - } - - } else { - int64_t stub_timeout_seconds = - model_state->StateForBackend()->stub_timeout_seconds; - - stub_pid_ = pid; - boost::posix_time::ptime timeout = - boost::get_system_time() + - boost::posix_time::seconds(stub_timeout_seconds); - - // Pre initialization step. - if (!parent_cond_->timed_wait(*parent_lock_, timeout)) { - return TRITONSERVER_ErrorNew( - TRITONSERVER_ERROR_INTERNAL, - (std::string("Timed out occurred while waiting for the stub process. " - "Failed to initialize model instance ") + - Name()) - .c_str()); - } - - triton::common::TritonJson::WriteBuffer buffer; - Model()->ModelConfig().Write(&buffer); - - std::unordered_map initialize_args = { - {"model_config", buffer.MutableContents()}, - {"model_instance_kind", TRITONSERVER_InstanceGroupKindString(kind_)}, - {"model_instance_name", name_}, - {"model_instance_device_id", std::to_string(device_id_)}, - {"model_repository", model_state->RepositoryPath()}, - {"model_version", std::to_string(model_state->Version())}, - {"model_name", model_state->Name()}}; - - off_t initialize_args_offset; - RETURN_IF_EXCEPTION(SaveMapToSharedMemory( - shm_pool_, initialize_args_offset, initialize_args)); - ipc_message_->request_batch = initialize_args_offset; - - // If parent fails to notify the stub or the stub fails to notify the - // parent in a timely manner, kill the stub process and restart the - // stub process. 
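The stub launch above forks and then replaces the child with a "bash -c <command>" image via execvp, so a failed exec can still be reported from the child. A stripped-down sketch of that pattern with a placeholder command:

#include <cstdio>
#include <sys/types.h>
#include <sys/wait.h>
#include <unistd.h>

int main()
{
  pid_t pid = fork();
  if (pid < 0) {
    std::perror("fork");
    return 1;
  }
  if (pid == 0) {
    // Child: replace the process image with "bash -c '<command>'".
    const char* args[] = {"bash", "-c", "echo stub would start here", nullptr};
    execvp("bash", const_cast<char* const*>(args));
    std::perror("execvp");  // reached only if exec itself failed
    _exit(1);
  }
  int status = 0;
  waitpid(pid, &status, 0);  // this toy parent simply waits for the child
  return WIFEXITED(status) ? WEXITSTATUS(status) : 1;
}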
- if (!NotifyStub() || !WaitForStubNotification()) { - return TRITONSERVER_ErrorNew( - TRITONSERVER_ERROR_INTERNAL, - (std::string("Failed to initialize stub, stub process exited " - "unexpectedly: ") + - name_) - .c_str()); - } - - ResponseBatch* response_batch; - RETURN_IF_EXCEPTION(shm_pool_->MapOffset( - (char**)&response_batch, sizeof(RequestBatch), - ipc_message_->response_batch)); - - if (response_batch->has_error) { - char* err_message; - RETURN_IF_EXCEPTION(LoadStringFromSharedMemory( - shm_pool_, response_batch->error, err_message)); - return TRITONSERVER_ErrorNew(TRITONSERVER_ERROR_INTERNAL, err_message); - } - - initialized_ = true; - } - - return nullptr; // success -} - -TRITONSERVER_Error* -ModelInstanceState::SetupStubProcess() -{ - std::string kind = TRITONSERVER_InstanceGroupKindString(kind_); - std::string shm_region_name = - std::string("/") + Name() + "_" + kind + "_" + std::to_string(device_id_); - - ModelState* model_state = reinterpret_cast(Model()); - int64_t shm_growth_size = - model_state->StateForBackend()->shm_growth_byte_size; - int64_t shm_default_size = - model_state->StateForBackend()->shm_default_byte_size; - - try { - shm_pool_ = std::make_unique( - shm_region_name, shm_default_size, shm_growth_size, - true /* truncate */); - } - catch (const PythonBackendException& pb_exception) { - return TRITONSERVER_ErrorNew( - TRITONSERVER_ERROR_INTERNAL, pb_exception.what()); - } - - // Stub mutex and CV - bi::interprocess_mutex* stub_mutex; - off_t stub_mutex_offset; - RETURN_IF_EXCEPTION(shm_pool_->Map( - (char**)&stub_mutex, sizeof(bi::interprocess_mutex), stub_mutex_offset)); - stub_mutex = new (stub_mutex) bi::interprocess_mutex; - - bi::interprocess_condition* stub_cv; - off_t stub_cv_offset; - RETURN_IF_EXCEPTION(shm_pool_->Map( - (char**)&stub_cv, sizeof(bi::interprocess_condition), stub_cv_offset)); - stub_cv = new (stub_cv) bi::interprocess_condition; - - stub_cond_ = stub_cv; - stub_mutex_ = stub_mutex; - - // Parent Mutex and CV - bi::interprocess_mutex* parent_mutex; - off_t parent_mutex_offset; - RETURN_IF_EXCEPTION(shm_pool_->Map( - (char**)&parent_mutex, sizeof(bi::interprocess_mutex), - parent_mutex_offset)); - parent_mutex = new (parent_mutex) bi::interprocess_mutex; - - bi::interprocess_condition* parent_cv; - off_t parent_cv_offset; - RETURN_IF_EXCEPTION(shm_pool_->Map( - (char**)&parent_cv, sizeof(bi::interprocess_condition), - parent_cv_offset)); - parent_cv = new (parent_cv) bi::interprocess_condition; - - bi::interprocess_mutex* health_mutex; - off_t health_mutex_offset; - RETURN_IF_EXCEPTION(shm_pool_->Map( - (char**)&health_mutex, sizeof(bi::interprocess_mutex), - health_mutex_offset)); - health_mutex = new (health_mutex) bi::interprocess_mutex; - - parent_cond_ = parent_cv; - parent_mutex_ = parent_mutex; - health_mutex_ = health_mutex; - parent_lock_ = - std::make_unique>(*parent_mutex); - - off_t ipc_offset; - RETURN_IF_EXCEPTION( - shm_pool_->Map((char**)&ipc_message_, sizeof(IPCMessage), ipc_offset)); - - uint64_t model_version = model_state->Version(); - const char* model_path = model_state->RepositoryPath().c_str(); - - std::stringstream ss; - // Use /version/model.py as the model location - ss << model_path << "/" << model_version << "/model.py"; - model_path_ = ss.str(); - struct stat buffer; - - // Check if model.py exists - if (stat(model_path_.c_str(), &buffer) != 0) { - return TRITONSERVER_ErrorNew( - TRITONSERVER_ERROR_INTERNAL, - ("model.py does not exist in the model repository path: " + model_path_) - .c_str()); - } - - // 
Path to the extracted Python env - std::string python_execution_env = ""; - if (model_state->PythonExecutionEnv() != "") { - try { - python_execution_env = - model_state->StateForBackend()->env_manager->ExtractIfNotExtracted( - model_state->PythonExecutionEnv()); - } - catch (PythonBackendException& pb_exception) { - return TRITONSERVER_ErrorNew( - TRITONSERVER_ERROR_INTERNAL, pb_exception.what()); - } - - path_to_activate_ = python_execution_env + "/bin/activate"; - path_to_libpython_ = python_execution_env + "/lib"; - if (python_execution_env.length() > 0 && !FileExists(path_to_activate_)) { - return TRITONSERVER_ErrorNew( - TRITONSERVER_ERROR_INTERNAL, - (std::string("Path ") + path_to_activate_ + - " does not exist. The Python environment should contain an " - "'activate' script.") - .c_str()); - } - } - - parent_pid_ = getpid(); - RETURN_IF_ERROR(StartStubProcess()); - - return nullptr; -} - -ModelInstanceState::~ModelInstanceState() -{ - if (initialized_) { - { - bi::scoped_lock lock(*health_mutex_); - ipc_message_->health = false; - } - - // Sleep 1 second so that the child process has a chance to change the - // health variable - sleep(1); - - bool healthy = false; - { - bi::scoped_lock lock(*health_mutex_); - healthy = ipc_message_->health; - } - - if (healthy) { - // Signal to the termination to the Python backend stub using a request of - // size 0. - RequestBatch* request_batch; - off_t request_batch_offset; - shm_pool_->Map( - (char**)&request_batch, sizeof(RequestBatch), request_batch_offset); - request_batch->batch_size = 0; - ipc_message_->request_batch = request_batch_offset; - - if (NotifyStub()) { - // Wait for stub notification - parent_cond_->wait(*parent_lock_); - } - } - } - - // Terminate the stub process if it has been created. - if (stub_pid_ != 0) { - int status; - kill(stub_pid_, SIGTERM); - waitpid(stub_pid_, &status, 0); - } - - // Destory the lock before deletion of shared memory is triggered. - parent_lock_.reset(nullptr); -} - -TRITONSERVER_Error* -ModelInstanceState::GetInputTensor( - const uint32_t input_idx, Tensor* input_tensor, - TRITONBACKEND_Request* request, - std::vector& responses) -{ - const char* input_name; - // Load iidx'th input name - RETURN_IF_ERROR( - TRITONBACKEND_RequestInputName(request, input_idx, &input_name)); - - // Load iidx'th input - TRITONBACKEND_Input* in; - RETURN_IF_ERROR(TRITONBACKEND_RequestInput(request, input_name, &in)); - - // Load input properties - TRITONSERVER_DataType input_dtype; - const int64_t* input_shape; - uint32_t input_dims_count; - uint64_t input_byte_size; - uint32_t input_buffer_count; - - RETURN_IF_ERROR(TRITONBACKEND_InputProperties( - in, &input_name, &input_dtype, &input_shape, &input_dims_count, - &input_byte_size, &input_buffer_count)); - - // If input_byte_size is larger than 2GBs, reject request the request. 
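The destructor above first attempts a cooperative shutdown through shared memory, then falls back to kill(SIGTERM) plus waitpid so the stub pid is always reaped. A minimal sketch of that fallback (TerminateChild is an illustrative helper, not a backend function):

#include <signal.h>
#include <sys/types.h>
#include <sys/wait.h>
#include <unistd.h>

void TerminateChild(pid_t child_pid)
{
  if (child_pid == 0) {
    return;  // the child was never started
  }
  kill(child_pid, SIGTERM);        // ask (or force) the child to stop
  int status = 0;
  waitpid(child_pid, &status, 0);  // reap it so no zombie is left behind
}

int main()
{
  pid_t pid = fork();
  if (pid == 0) {
    pause();  // child: wait until a signal arrives
    _exit(0);
  }
  TerminateChild(pid);
  return 0;
}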
- uint64_t max_input_size = INT32_MAX; - if (input_byte_size > max_input_size) { - return TRITONSERVER_ErrorNew( - TRITONSERVER_ERROR_UNSUPPORTED, - "Python backend does not support input size larger than 2GBs, consider " - "partitioning your input into multiple inputs."); - } - - // We need to create a new collector for every request because python backend - // sends each request individually to the python model - BackendInputCollector collector( - &request, 1, &responses, Model()->TritonMemoryManager(), - false /* pinned_enable */, CudaStream()); - - const TRITONSERVER_MemoryType memory_type = TRITONSERVER_MEMORY_CPU; - const int memory_type_id = 0; - - char* input_buffer; - RETURN_IF_EXCEPTION(SaveTensorToSharedMemory( - shm_pool_, input_tensor, input_buffer, memory_type, memory_type_id, - input_byte_size, input_name, input_shape, input_dims_count, input_dtype)); - - // Load raw data into input_tensor raw data. - // FIXME: Avoid the copy to CPU Memory when - // the data is in GPU. - collector.ProcessTensor( - input_name, input_buffer, input_byte_size, memory_type, memory_type_id); - - return nullptr; -} - -TRITONSERVER_Error* -ModelState::Create(TRITONBACKEND_Model* triton_model, ModelState** state) -{ - try { - *state = new ModelState(triton_model); - } - catch (const BackendModelException& ex) { - RETURN_ERROR_IF_TRUE( - ex.err_ == nullptr, TRITONSERVER_ERROR_INTERNAL, - std::string("unexpected nullptr in BackendModelException")); - RETURN_IF_ERROR(ex.err_); - } - - return nullptr; // success -} - -ModelState::ModelState(TRITONBACKEND_Model* triton_model) - : BackendModel(triton_model) -{ - TRITONBACKEND_Backend* backend; - THROW_IF_BACKEND_MODEL_ERROR( - TRITONBACKEND_ModelBackend(triton_model, &backend)); - - const char* path = nullptr; - TRITONBACKEND_ArtifactType artifact_type; - THROW_IF_BACKEND_MODEL_ERROR( - TRITONBACKEND_ModelRepository(triton_model, &artifact_type, &path)); - python_execution_env_ = ""; - - void* bstate; - THROW_IF_BACKEND_MODEL_ERROR(TRITONBACKEND_BackendState(backend, &bstate)); - backend_state_ = reinterpret_cast(bstate); - triton::common::TritonJson::Value params; - if (model_config_.Find("parameters", ¶ms)) { - // Skip the EXECUTION_ENV_PATH variable if it doesn't exist. - TRITONSERVER_Error* error = - GetParameterValue(params, "EXECUTION_ENV_PATH", &python_execution_env_); - if (error == nullptr) { - LOG_MESSAGE( - TRITONSERVER_LOG_INFO, - (std::string("Using Python execution env ") + python_execution_env_) - .c_str()); - } else { - // Delete the error - TRITONSERVER_ErrorDelete(error); - } - } - - if (artifact_type != TRITONBACKEND_ARTIFACT_FILESYSTEM) { - throw triton::backend::BackendModelException(TRITONSERVER_ErrorNew( - TRITONSERVER_ERROR_UNSUPPORTED, - (std::string("unsupported artifact type for model '") + Name() + "'") - .c_str())); - } -} - -extern "C" { - -TRITONSERVER_Error* -TRITONBACKEND_Initialize(TRITONBACKEND_Backend* backend) -{ - const char* cname; - RETURN_IF_ERROR(TRITONBACKEND_BackendName(backend, &cname)); - std::string name(cname); - - // Check backend version to ensure compatibility - uint32_t api_version_major, api_version_minor; - RETURN_IF_ERROR( - TRITONBACKEND_ApiVersion(&api_version_major, &api_version_minor)); - LOG_MESSAGE( - TRITONSERVER_LOG_VERBOSE, - (std::string("'") + name + "' TRITONBACKEND API version: " + - std::to_string(TRITONBACKEND_API_VERSION_MAJOR) + "." 
+ - std::to_string(TRITONBACKEND_API_VERSION_MINOR)) - .c_str()); - - if ((api_version_major != TRITONBACKEND_API_VERSION_MAJOR) || - (api_version_minor < TRITONBACKEND_API_VERSION_MINOR)) { - return TRITONSERVER_ErrorNew( - TRITONSERVER_ERROR_UNSUPPORTED, - "Triton backend API version does not support this backend"); - } - - TRITONSERVER_Message* backend_config_message; - RETURN_IF_ERROR( - TRITONBACKEND_BackendConfig(backend, &backend_config_message)); - - const char* buffer; - size_t byte_size; - RETURN_IF_ERROR(TRITONSERVER_MessageSerializeToJson( - backend_config_message, &buffer, &byte_size)); - LOG_MESSAGE( - TRITONSERVER_LOG_VERBOSE, - (std::string("backend configuration:\n") + buffer).c_str()); - - triton::common::TritonJson::Value backend_config; - if (byte_size != 0) { - RETURN_IF_ERROR(backend_config.Parse(buffer, byte_size)); - } - - std::unique_ptr backend_state(new BackendState()); - triton::common::TritonJson::Value cmdline; - backend_state->shm_default_byte_size = 64 * 1024 * 1024; // 64 MBs - backend_state->shm_growth_byte_size = 64 * 1024 * 1024; // 64 MBs - backend_state->stub_timeout_seconds = 30; - - if (backend_config.Find("cmdline", &cmdline)) { - triton::common::TritonJson::Value shm_growth_size; - std::string shm_growth_byte_size; - if (cmdline.Find("shm-growth-byte-size", &shm_growth_size)) { - RETURN_IF_ERROR(shm_growth_size.AsString(&shm_growth_byte_size)); - try { - backend_state->shm_growth_byte_size = std::stol(shm_growth_byte_size); - if (backend_state->shm_growth_byte_size <= 0) { - return TRITONSERVER_ErrorNew( - TRITONSERVER_ERROR_INVALID_ARG, - (std::string("shm-growth-byte-size") + - " can't be smaller than or equal to zero.") - .c_str()); - } - } - catch (const std::invalid_argument& ia) { - return TRITONSERVER_ErrorNew(TRITONSERVER_ERROR_INVALID_ARG, ia.what()); - } - } - - triton::common::TritonJson::Value shm_default_size; - std::string shm_default_byte_size; - if (cmdline.Find("shm-default-byte-size", &shm_default_size)) { - RETURN_IF_ERROR(shm_default_size.AsString(&shm_default_byte_size)); - try { - backend_state->shm_default_byte_size = std::stol(shm_default_byte_size); - // Shared memory default byte size can't be less than 4 MBs. 
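The cmdline options above arrive as strings and are validated with std::stol, rejecting values that are non-numeric or not strictly positive. A self-contained sketch of that parsing step (ParsePositiveByteSize is an illustrative helper):

#include <cstdint>
#include <iostream>
#include <stdexcept>
#include <string>

bool ParsePositiveByteSize(const std::string& value, int64_t* out)
{
  try {
    int64_t parsed = std::stol(value);
    if (parsed <= 0) {
      return false;  // sizes must be strictly positive
    }
    *out = parsed;
    return true;
  }
  catch (const std::invalid_argument&) {
    return false;  // the option value was not a number
  }
}

int main()
{
  int64_t bytes = 0;
  std::cout << std::boolalpha << ParsePositiveByteSize("67108864", &bytes)
            << " " << bytes << " " << ParsePositiveByteSize("abc", &bytes)
            << "\n";
}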
- if (backend_state->shm_default_byte_size < 4 * 1024 * 1024) { - return TRITONSERVER_ErrorNew( - TRITONSERVER_ERROR_INVALID_ARG, - (std::string("shm-default-byte-size") + - " can't be smaller than 4 MiBs") - .c_str()); - } - } - catch (const std::invalid_argument& ia) { - return TRITONSERVER_ErrorNew(TRITONSERVER_ERROR_INVALID_ARG, ia.what()); - } - } - - triton::common::TritonJson::Value stub_timeout_seconds; - std::string stub_timeout_string_seconds; - if (cmdline.Find("stub-timeout-seconds", &stub_timeout_seconds)) { - RETURN_IF_ERROR( - stub_timeout_seconds.AsString(&stub_timeout_string_seconds)); - try { - backend_state->stub_timeout_seconds = - std::stol(stub_timeout_string_seconds); - if (backend_state->stub_timeout_seconds <= 0) { - return TRITONSERVER_ErrorNew( - TRITONSERVER_ERROR_INVALID_ARG, - (std::string("stub-timeout-seconds") + - " can't be smaller than or equal to zero.") - .c_str()); - } - } - catch (const std::invalid_argument& ia) { - return TRITONSERVER_ErrorNew(TRITONSERVER_ERROR_INVALID_ARG, ia.what()); - } - } - } - - LOG_MESSAGE( - TRITONSERVER_LOG_VERBOSE, - (std::string("shm-default-byte-size=") + - std::to_string(backend_state->shm_default_byte_size) + - ",shm-growth-byte-size=" + - std::to_string(backend_state->shm_growth_byte_size) + - ",stub-timeout-seconds=" + - std::to_string(backend_state->stub_timeout_seconds)) - .c_str()); - - // Use BackendArtifacts to determine the location of Python files - const char* location; - TRITONBACKEND_ArtifactType artifact_type; - RETURN_IF_ERROR( - TRITONBACKEND_BackendArtifacts(backend, &artifact_type, &location)); - backend_state->python_lib = location; - backend_state->env_manager = std::make_unique(); - - RETURN_IF_ERROR(TRITONBACKEND_BackendSetState( - backend, reinterpret_cast(backend_state.get()))); - - backend_state.release(); - return nullptr; -} - -TRITONSERVER_Error* -TRITONBACKEND_Finalize(TRITONBACKEND_Backend* backend) -{ - LOG_MESSAGE(TRITONSERVER_LOG_VERBOSE, "TRITONBACKEND_Finalize: Start"); - void* vstate; - RETURN_IF_ERROR(TRITONBACKEND_BackendState(backend, &vstate)); - auto backend_state = reinterpret_cast(vstate); - delete backend_state; - LOG_MESSAGE(TRITONSERVER_LOG_VERBOSE, "TRITONBACKEND_Finalize: End"); - return nullptr; // success -} - -TRITONSERVER_Error* -TRITONBACKEND_ModelInitialize(TRITONBACKEND_Model* model) -{ - const char* cname; - RETURN_IF_ERROR(TRITONBACKEND_ModelName(model, &cname)); - std::string name(cname); - - uint64_t version; - RETURN_IF_ERROR(TRITONBACKEND_ModelVersion(model, &version)); - - TRITONSERVER_LogMessage( - TRITONSERVER_LOG_VERBOSE, __FILE__, __LINE__, - (std::string("TRITONBACKEND_ModelInitialize: ") + name + " (version " + - std::to_string(version) + ")") - .c_str()); - - TRITONBACKEND_Backend* backend; - RETURN_IF_ERROR(TRITONBACKEND_ModelBackend(model, &backend)); - - ModelState* model_state; - RETURN_IF_ERROR(ModelState::Create(model, &model_state)); - RETURN_IF_ERROR( - TRITONBACKEND_ModelSetState(model, reinterpret_cast(model_state))); - - return nullptr; -} - -TRITONSERVER_Error* -TRITONBACKEND_ModelFinalize(TRITONBACKEND_Model* model) -{ - void* vstate; - RETURN_IF_ERROR(TRITONBACKEND_ModelState(model, &vstate)); - ModelState* model_state = reinterpret_cast(vstate); - - LOG_MESSAGE( - TRITONSERVER_LOG_VERBOSE, - "TRITONBACKEND_ModelFinalize: delete model state"); - - delete model_state; - - return nullptr; -} - -TRITONSERVER_Error* -TRITONBACKEND_ModelInstanceInitialize(TRITONBACKEND_ModelInstance* instance) -{ - const char* cname; - 
RETURN_IF_ERROR(TRITONBACKEND_ModelInstanceName(instance, &cname)); - std::string name(cname); - - int32_t device_id; - RETURN_IF_ERROR(TRITONBACKEND_ModelInstanceDeviceId(instance, &device_id)); - TRITONSERVER_InstanceGroupKind kind; - RETURN_IF_ERROR(TRITONBACKEND_ModelInstanceKind(instance, &kind)); - - LOG_MESSAGE( - TRITONSERVER_LOG_INFO, - (std::string("TRITONBACKEND_ModelInstanceInitialize: ") + name + " (" + - TRITONSERVER_InstanceGroupKindString(kind) + " device " + - std::to_string(device_id) + ")") - .c_str()); - - TRITONBACKEND_Model* model; - RETURN_IF_ERROR(TRITONBACKEND_ModelInstanceModel(instance, &model)); - - void* vmodelstate; - RETURN_IF_ERROR(TRITONBACKEND_ModelState(model, &vmodelstate)); - ModelState* model_state = reinterpret_cast(vmodelstate); - - ModelInstanceState* instance_state; - RETURN_IF_ERROR( - ModelInstanceState::Create(model_state, instance, &instance_state)); - RETURN_IF_ERROR(TRITONBACKEND_ModelInstanceSetState( - instance, reinterpret_cast(instance_state))); - - RETURN_IF_ERROR(instance_state->SetupStubProcess()); - LOG_MESSAGE( - TRITONSERVER_LOG_VERBOSE, - (std::string("TRITONBACKEND_ModelInstanceInitialize: instance " - "initialization successful ") + - name + " (device " + std::to_string(device_id) + ")") - .c_str()); - - return nullptr; -} - -TRITONSERVER_Error* -TRITONBACKEND_ModelInstanceExecute( - TRITONBACKEND_ModelInstance* instance, TRITONBACKEND_Request** requests, - const uint32_t request_count) -{ - ModelInstanceState* instance_state; - RETURN_IF_ERROR(TRITONBACKEND_ModelInstanceState( - instance, reinterpret_cast(&instance_state))); - RETURN_IF_ERROR(instance_state->ProcessRequests(requests, request_count)); - - for (uint32_t r = 0; r < request_count; ++r) { - TRITONBACKEND_Request* request = requests[r]; - - LOG_IF_ERROR( - TRITONBACKEND_RequestRelease(request, TRITONSERVER_REQUEST_RELEASE_ALL), - "failed releasing request"); - } - - return nullptr; -} - -TRITONSERVER_Error* -TRITONBACKEND_ModelInstanceFinalize(TRITONBACKEND_ModelInstance* instance) -{ - void* vstate; - RETURN_IF_ERROR(TRITONBACKEND_ModelInstanceState(instance, &vstate)); - ModelInstanceState* instance_state = - reinterpret_cast(vstate); - - LOG_MESSAGE( - TRITONSERVER_LOG_VERBOSE, - "TRITONBACKEND_ModelInstanceFinalize: delete instance state"); - - delete instance_state; - - return nullptr; -} - -} // extern "C" - -}}} // namespace triton::backend::python diff --git a/src/python_be.cc b/src/python_be.cc new file mode 100644 index 00000000..c152e035 --- /dev/null +++ b/src/python_be.cc @@ -0,0 +1,2465 @@ +// Copyright 2020-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions +// are met: +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// * Neither the name of NVIDIA CORPORATION nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. 
+// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +#include "python_be.h" + +#include + +#include "correlation_id.h" +#include "gpu_buffers.h" +#include "infer_payload.h" +#include "model_loader.h" +#include "pb_log.h" + +namespace triton { namespace backend { namespace python { + +namespace bi = boost::interprocess; + +ModelInstanceState::ModelInstanceState( + ModelState* model_state, TRITONBACKEND_ModelInstance* triton_model_instance) + : BackendModelInstance(model_state, triton_model_instance), + stub_to_parent_thread_(false) +{ +} + +TRITONSERVER_Error* +ModelInstanceState::Create( + ModelState* model_state, TRITONBACKEND_ModelInstance* triton_model_instance, + ModelInstanceState** state) +{ + try { + *state = new ModelInstanceState(model_state, triton_model_instance); + } + catch (const BackendModelInstanceException& ex) { + RETURN_ERROR_IF_TRUE( + ex.err_ == nullptr, TRITONSERVER_ERROR_INTERNAL, + std::string("unexpected nullptr in BackendModelInstanceException")); + RETURN_IF_ERROR(ex.err_); + } + return nullptr; +} + +TRITONSERVER_Error* +ModelInstanceState::CheckIncomingRequests( + TRITONBACKEND_Request** requests, const uint32_t request_count, + size_t& total_batch_size) +{ + ModelState* model_state = reinterpret_cast(Model()); + int max_batch_size = model_state->MaxBatchSize(); + + // For each request collect the total batch size for this inference + // execution. The batch-size, number of inputs, and size of each + // input has already been checked so don't need to do that here. + total_batch_size = 0; + for (size_t i = 0; i < request_count; i++) { + // If we get a nullptr request then something is badly wrong. Fail + // and release all requests. + if (requests[i] == nullptr) { + return TRITONSERVER_ErrorNew( + TRITONSERVER_ERROR_INTERNAL, + std::string( + "null request given to Python backend for '" + Name() + "'") + .c_str()); + } + } + + for (size_t i = 0; i < request_count; i++) { + if (max_batch_size > 0) { + // Retrieve the batch size from one of the inputs, if the model + // supports batching, the first dimension size is batch size + TRITONBACKEND_Input* input; + TRITONSERVER_Error* err = + TRITONBACKEND_RequestInputByIndex(requests[i], 0 /* index */, &input); + if (err == nullptr) { + const int64_t* shape; + err = TRITONBACKEND_InputProperties( + input, nullptr, nullptr, &shape, nullptr, nullptr, nullptr); + total_batch_size += shape[0]; + } + if (err != nullptr) { + return err; + } + } else { + ++total_batch_size; + } + } + + // If there are no valid payloads then no need to run the inference. + if (total_batch_size == 0) { + return nullptr; + } + + // Make sure the maximum batch size is not exceeded. The + // total_batch_size must be 1 for models that don't support batching + // (i.e. max_batch_size == 0). 
If max_batch_size is exceeded then + // scheduler has done something badly wrong so fail and release all + // requests. + if ((total_batch_size != 1) && (total_batch_size > (size_t)max_batch_size)) { + return TRITONSERVER_ErrorNew( + TRITONSERVER_ERROR_INTERNAL, + std::string( + "batch size " + std::to_string(total_batch_size) + " for '" + + Name() + "', max allowed is " + std::to_string(max_batch_size)) + .c_str()); + } + + return nullptr; // success +} + +bool +ModelInstanceState::ExistsInClosedRequests(intptr_t closed_request) +{ + std::lock_guard guard{closed_requests_mutex_}; + return std::find( + closed_requests_.begin(), closed_requests_.end(), + closed_request) != closed_requests_.end(); +} + +void +ModelInstanceState::SetErrorForResponseSendMessage( + ResponseSendMessage* response_send_message, + std::shared_ptr error, + std::unique_ptr& error_message) +{ + if (error && *error != nullptr) { + response_send_message->has_error = true; + LOG_IF_EXCEPTION( + error_message = PbString::Create( + Stub()->ShmPool(), TRITONSERVER_ErrorMessage(*error))); + response_send_message->error = error_message->ShmHandle(); + response_send_message->is_error_set = true; + } +} + +bool +ModelInstanceState::IsStubProcessAlive() +{ + boost::posix_time::ptime timeout = + boost::get_system_time() + boost::posix_time::seconds(1); + bi::scoped_lock lock(*Stub()->HealthMutex(), timeout); + + // Check if lock has been acquired. + if (lock) { + return Stub()->IpcControl()->stub_health; + } else { + // If It failed to obtain the lock, it means that the stub has been + // stuck or exited while holding the health mutex lock. + return false; + } +} + +TRITONSERVER_Error* +ModelInstanceState::SaveRequestsToSharedMemory( + TRITONBACKEND_Request** requests, const uint32_t request_count, + std::vector>& pb_infer_requests, + AllocatedSharedMemory& request_batch, + std::shared_ptr>& responses) +{ + // Clear any existing items in the requests vector + pb_infer_requests.clear(); + + ModelState* model_state = reinterpret_cast(Model()); + RETURN_IF_EXCEPTION( + request_batch = Stub()->ShmPool()->Construct( + sizeof(RequestBatch) + + request_count * sizeof(bi::managed_external_buffer::handle_t))); + + RequestBatch* request_batch_shm_ptr = + reinterpret_cast(request_batch.data_.get()); + request_batch_shm_ptr->batch_size = request_count; + + bi::managed_external_buffer::handle_t* requests_shm = + reinterpret_cast( + request_batch.data_.get() + sizeof(RequestBatch)); + + for (uint32_t r = 0; r < request_count; ++r) { + TRITONBACKEND_Request* request = requests[r]; + uint32_t requested_input_count = 0; + RETURN_IF_ERROR( + TRITONBACKEND_RequestInputCount(request, &requested_input_count)); + + uint32_t requested_output_count = 0; + RETURN_IF_ERROR( + TRITONBACKEND_RequestOutputCount(request, &requested_output_count)); + + std::vector> pb_input_tensors; + for (size_t iidx = 0; iidx < requested_input_count; ++iidx) { + std::shared_ptr pb_input_tensor; + + RETURN_IF_ERROR( + GetInputTensor(iidx, pb_input_tensor, request, responses)); + pb_input_tensors.emplace_back(std::move(pb_input_tensor)); + } + + std::set requested_output_names; + // Append the list of requested outputs to the inference_request + for (size_t iidx = 0; iidx < requested_output_count; ++iidx) { + const char* requested_output_name; + RETURN_IF_ERROR(TRITONBACKEND_RequestOutputName( + request, iidx, &requested_output_name)); + requested_output_names.emplace(requested_output_name); + } + + triton::common::TritonJson::Value parameters_json( + 
triton::common::TritonJson::ValueType::OBJECT); + uint32_t parameter_count; + RETURN_IF_ERROR( + TRITONBACKEND_RequestParameterCount(request, ¶meter_count)); + for (size_t i = 0; i < parameter_count; i++) { + const char* name; + TRITONSERVER_ParameterType type; + const void* vvalue; + RETURN_IF_ERROR( + TRITONBACKEND_RequestParameter(request, i, &name, &type, &vvalue)); + if (type == TRITONSERVER_PARAMETER_INT) { + RETURN_IF_ERROR(parameters_json.AddInt( + name, *(reinterpret_cast(vvalue)))); + } else if (type == TRITONSERVER_PARAMETER_BOOL) { + RETURN_IF_ERROR(parameters_json.AddBool( + name, *(reinterpret_cast(vvalue)))); + } else if (type == TRITONSERVER_PARAMETER_STRING) { + std::string string = reinterpret_cast(vvalue); + RETURN_IF_ERROR(parameters_json.AddString(name, string)); + } else if (type == TRITONSERVER_PARAMETER_DOUBLE) { + RETURN_IF_ERROR(parameters_json.AddDouble( + name, *(reinterpret_cast(vvalue)))); + } else { + return TRITONSERVER_ErrorNew( + TRITONSERVER_ERROR_INVALID_ARG, + (std::string("Unsupported parameter type for parameter '") + name + + "'.") + .c_str()); + } + } + + triton::common::TritonJson::WriteBuffer buffer; + RETURN_IF_ERROR(parameters_json.Write(&buffer)); + const auto& parameters_string = buffer.Contents(); + + // request id + const char* id; + RETURN_IF_ERROR(TRITONBACKEND_RequestId(request, &id)); + + uint64_t correlation_id_uint = 0; + CorrelationId correlation_id; + + auto error = + TRITONBACKEND_RequestCorrelationId(request, &correlation_id_uint); + if (error != nullptr) { + TRITONSERVER_ErrorDelete(error); + const char* correlation_id_string = ""; + RETURN_IF_ERROR(TRITONBACKEND_RequestCorrelationIdString( + request, &correlation_id_string)); + correlation_id = CorrelationId(std::string(correlation_id_string)); + } else { + correlation_id = CorrelationId(correlation_id_uint); + } + + uint32_t flags; + RETURN_IF_ERROR(TRITONBACKEND_RequestFlags(request, &flags)); + + // Do not return if error in this case, because Triton core + // will return an error if tracing is disabled (see PYBE PR#295). + // For the same reason, we do not log the error message, otherwise + // when Triton is compiled without tracing, it'll constantly log + // this error. + TRITONSERVER_InferenceTrace* triton_trace; + auto err = TRITONBACKEND_RequestTrace(request, &triton_trace); + if (err != nullptr) { + triton_trace = nullptr; + TRITONSERVER_ErrorDelete(err); + } + const char* val = nullptr; + if (triton_trace != nullptr) { + LOG_IF_ERROR( + TRITONSERVER_InferenceTraceContext(triton_trace, &val), + "failed to retrieve trace context"); + } + std::string context = (val != nullptr) ? 
std::string(val) : ""; + + InferenceTrace trace = + InferenceTrace(reinterpret_cast(triton_trace), context); + + uint64_t request_timeout; + RETURN_IF_ERROR(TRITONBACKEND_InferenceRequestTimeoutMicroseconds( + request, &request_timeout)); + + std::unique_ptr infer_request; + TRITONBACKEND_ResponseFactory* factory_ptr = nullptr; + RETURN_IF_ERROR(TRITONBACKEND_ResponseFactoryNew(&factory_ptr, request)); + + infer_request = std::make_unique( + id, correlation_id, pb_input_tensors, requested_output_names, + model_state->Name(), model_state->Version(), parameters_string, flags, + request_timeout, reinterpret_cast(factory_ptr), + reinterpret_cast(request), + PreferredMemory(PreferredMemory::kDefault, 0), trace); + RETURN_IF_EXCEPTION(infer_request->SaveToSharedMemory(Stub()->ShmPool())); + requests_shm[r] = infer_request->ShmHandle(); + pb_infer_requests.emplace_back(std::move(infer_request)); + } + + return nullptr; // success +} + +TRITONSERVER_Error* +ModelInstanceState::LaunchStubProcess() +{ + ModelState* model_state = reinterpret_cast(Model()); + Stub() = std::make_unique( + "MODEL_INSTANCE_STUB", Name(), DeviceId(), + TRITONSERVER_InstanceGroupKindString(Kind())); + RETURN_IF_ERROR(Stub()->Initialize(model_state)); + RETURN_IF_ERROR(Stub()->Setup()); + StartMonitor(); + RETURN_IF_ERROR(Stub()->Launch()); + + thread_pool_ = std::make_unique( + model_state->StateForBackend()->thread_pool_size); + + request_executor_ = std::make_unique( + Stub()->ShmPool(), model_state->TritonServer()); + + return nullptr; +} + +TRITONSERVER_Error* +ModelInstanceState::GetInputTensor( + const uint32_t input_idx, std::shared_ptr& input_tensor, + TRITONBACKEND_Request* request, + std::shared_ptr>& responses) +{ + NVTX_RANGE(nvtx_, "GetInputTensor " + Name()); + const char* input_name; + // Load iidx'th input name + RETURN_IF_ERROR( + TRITONBACKEND_RequestInputName(request, input_idx, &input_name)); + + // Load iidx'th input + TRITONBACKEND_Input* in; + RETURN_IF_ERROR(TRITONBACKEND_RequestInput(request, input_name, &in)); + + // Load input properties + TRITONSERVER_DataType input_dtype; + const int64_t* input_shape; + uint32_t input_dims_count; + uint64_t input_byte_size; + uint32_t input_buffer_count; + + RETURN_IF_ERROR(TRITONBACKEND_InputPropertiesForHostPolicy( + in, HostPolicyName().c_str(), &input_name, &input_dtype, &input_shape, + &input_dims_count, &input_byte_size, &input_buffer_count)); + + // Only use input collector when a response array is provided. + std::unique_ptr collector; + if (responses) { + collector = std::make_unique( + &request, 1, responses.get(), Model()->TritonMemoryManager(), + false /* pinned_enable */, CudaStream(), nullptr, nullptr, 0, + HostPolicyName().c_str()); + } + + ModelState* model_state = reinterpret_cast(Model()); + bool cpu_only_tensors = model_state->ForceCPUOnlyInputTensors(); + + if (input_dtype == TRITONSERVER_TYPE_BYTES) { + cpu_only_tensors = true; + } + +#ifdef TRITON_ENABLE_GPU + CUDAHandler& cuda_handler = CUDAHandler::getInstance(); + // If CUDA driver API is not available, the input tensors will be moved to + // CPU. 
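GetInputTensor above forces a tensor onto the CPU when the model requests it, when the dtype is BYTES, when the CUDA driver is unavailable, or when the backend is built without GPU support (the #ifndef fallback just below). A compact sketch of that decision with illustrative flags:

#include <iostream>

bool ForceCpuOnly(bool model_forces_cpu, bool is_bytes_dtype,
                  bool cuda_driver_available, bool built_with_gpu)
{
  bool cpu_only = model_forces_cpu;
  if (is_bytes_dtype) {
    cpu_only = true;  // string tensors are always handled on the CPU
  }
  if (!cuda_driver_available || !built_with_gpu) {
    cpu_only = true;  // no way to keep the data on the device
  }
  return cpu_only;
}

int main()
{
  std::cout << std::boolalpha
            << ForceCpuOnly(false, false, true, true) << " "   // false
            << ForceCpuOnly(false, true, true, true) << "\n";  // true
}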
+ if (!cuda_handler.IsAvailable() && !cpu_only_tensors) { + if (!cuda_handler.GetErrorString().empty()) { + LOG_MESSAGE( + TRITONSERVER_LOG_WARN, (std::string( + "Forcing CPU only input tensors: " + + cuda_handler.GetErrorString())) + .c_str()); + } + cuda_handler.ClearErrorString(); + cpu_only_tensors = true; + } +#endif + + TRITONSERVER_MemoryType src_memory_type; + int64_t src_memory_type_id; + size_t src_byte_size; + const void* src_ptr; + RETURN_IF_ERROR(TRITONBACKEND_InputBuffer( + in, 0 /* input buffer index */, &src_ptr, &src_byte_size, + &src_memory_type, &src_memory_type_id)); + +// If TRITON_ENABLE_GPU is false, we need to copy the tensors +// to the CPU. +#ifndef TRITON_ENABLE_GPU + cpu_only_tensors = true; +#endif // TRITON_ENABLE_GPU + + if (cpu_only_tensors || src_memory_type != TRITONSERVER_MEMORY_GPU) { + input_tensor = std::make_shared( + std::string(input_name), + std::vector(input_shape, input_shape + input_dims_count), + input_dtype, TRITONSERVER_MEMORY_CPU /* memory_type */, + 0 /* memory_type_id */, nullptr /* buffer ptr*/, input_byte_size, + nullptr /* DLManagedTensor */); + RETURN_IF_EXCEPTION(input_tensor->SaveToSharedMemory( + Stub()->ShmPool(), false /* copy_gpu */)); + char* input_buffer = reinterpret_cast(input_tensor->DataPtr()); + + if (collector) { + collector->ProcessTensor( + input_name, input_buffer, input_byte_size, + TRITONSERVER_MEMORY_CPU /* memory_type */, 0 /* memory_type_id */); + } else { + size_t byte_size = input_byte_size; + RETURN_IF_ERROR(backend::ReadInputTensor( + request, input_name, input_buffer, &byte_size)); + } + + if (input_dtype == TRITONSERVER_TYPE_BYTES) { + const char* content = reinterpret_cast(input_tensor->DataPtr()); + size_t content_byte_size = input_tensor->ByteSize(); + int64_t request_element_cnt = 0; + RETURN_IF_ERROR( + GetElementCount(input_tensor->Dims(), &request_element_cnt)); + RETURN_IF_ERROR(ValidateStringBuffer( + content, content_byte_size, request_element_cnt, input_name, + nullptr /* str_list */)); + } + } else { +#ifdef TRITON_ENABLE_GPU + // Attempt to use the cuda shared memory pool for GPU tensor. + ShareCUDAMemoryPool(src_memory_type_id); + + // Retrieving GPU input tensors + const void* buffer = nullptr; + std::vector> alloc_perference; + alloc_perference = {{TRITONSERVER_MEMORY_GPU, src_memory_type_id}}; + + // collector is used in the non-decoupled mode. + if (collector) { + // The ProcessTensor function will try to allocate the buffer in the CUDA + // pool first. + RETURN_IF_ERROR(collector->ProcessTensor( + input_name, nullptr, 0, alloc_perference, + reinterpret_cast(&buffer), &input_byte_size, + &src_memory_type, &src_memory_type_id)); + // If the tensor is using the cuda shared memory, we need to extract the + // handle that was used to create the device pointer. This is because of a + // limitation in the legacy CUDA IPC API that doesn't allow getting the + // handle of an exported pointer. If the cuda handle exists, it indicates + // that the cuda shared memory was used and the input is in a single + // buffer. + // [FIXME] For the case where the input is in cuda shared memory and uses + // multiple input buffers this needs to be changed. + TRITONSERVER_BufferAttributes* buffer_attributes; + + // This value is not used. 
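The GPU path above asks for memory with an ordered preference list: the CUDA shared memory pool first, then a plain device allocation. A generic sketch of preference-ordered allocation with stand-in allocator callbacks (no CUDA required, names are illustrative):

#include <cstddef>
#include <functional>
#include <iostream>
#include <vector>

void* AllocateWithPreference(
    const std::vector<std::function<void*(std::size_t)>>& allocators,
    std::size_t bytes)
{
  for (const auto& alloc : allocators) {
    if (void* ptr = alloc(bytes)) {
      return ptr;  // the first allocator that succeeds wins
    }
  }
  return nullptr;
}

int main()
{
  auto pool_alloc = [](std::size_t) -> void* { return nullptr; };  // pool exhausted
  auto plain_alloc = [](std::size_t bytes) -> void* {
    return ::operator new(bytes);
  };

  void* p = AllocateWithPreference({pool_alloc, plain_alloc}, 64);
  std::cout << (p != nullptr ? "allocated via fallback" : "failed") << "\n";
  ::operator delete(p);
}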
+ const void* buffer_p; + RETURN_IF_ERROR(TRITONBACKEND_InputBufferAttributes( + in, 0, &buffer_p, &buffer_attributes)); + + input_tensor = std::make_shared( + std::string(input_name), + std::vector(input_shape, input_shape + input_dims_count), + input_dtype, src_memory_type, src_memory_type_id, + const_cast(buffer), input_byte_size, + nullptr /* DLManagedTensor */); + + cudaIpcMemHandle_t* cuda_ipc_handle; + RETURN_IF_ERROR(TRITONSERVER_BufferAttributesCudaIpcHandle( + buffer_attributes, reinterpret_cast(&cuda_ipc_handle))); + if (cuda_ipc_handle != nullptr) { + RETURN_IF_EXCEPTION(input_tensor->SaveToSharedMemory( + Stub()->ShmPool(), false /* copy_gpu */)); + RETURN_IF_EXCEPTION( + input_tensor->Memory()->SetCudaIpcHandle(cuda_ipc_handle)); + } else { + RETURN_IF_EXCEPTION(input_tensor->SaveToSharedMemory( + Stub()->ShmPool(), true /* copy_gpu */)); + } + } else { + // Try to use the cuda shared memory pool first. + void* dev_ptr; + BackendMemory* backend_memory; + std::unique_ptr lbackend_memory; + RETURN_IF_ERROR(BackendMemory::Create( + reinterpret_cast( + Stub() + ->ShmPool() + ->GetCUDAMemoryPoolManager() + ->TritonMemoryManager()), + {BackendMemory::AllocationType::GPU_POOL, + BackendMemory::AllocationType::GPU}, + src_memory_type_id, input_byte_size, &backend_memory)); + + dev_ptr = backend_memory->MemoryPtr(); + lbackend_memory.reset(backend_memory); + + size_t byte_size = input_byte_size; + + bool cuda_used = false; + RETURN_IF_ERROR(backend::ReadInputTensor( + request, input_name, reinterpret_cast(dev_ptr), &byte_size, + TRITONSERVER_MEMORY_GPU, src_memory_type_id, CudaStream(), + &cuda_used)); + + if (cuda_used) { +#ifdef TRITON_ENABLE_GPU + cudaStreamSynchronize(stream_); +#endif + } + + input_tensor = std::make_shared( + std::string(input_name), + std::vector(input_shape, input_shape + input_dims_count), + input_dtype, src_memory_type, src_memory_type_id, + const_cast(dev_ptr), input_byte_size, + nullptr /* DLManagedTensor */); + + input_tensor->SetMemory(std::move( + PbMemory::Create(Stub()->ShmPool(), std::move(lbackend_memory)))); + + RETURN_IF_EXCEPTION(input_tensor->SaveToSharedMemory( + Stub()->ShmPool(), true /* copy_gpu */)); + } +#else + return TRITONSERVER_ErrorNew( + TRITONSERVER_ERROR_INTERNAL, + "Python backend does not support GPU tensors."); +#endif // TRITON_ENABLE_GPU + } + + return nullptr; +} + +void +ModelInstanceState::ExecuteBLSRequest( + std::shared_ptr ipc_message, const bool is_decoupled) +{ + bool is_response_batch_set = false; + std::unique_ptr infer_response; + ResponseBatch* response_batch = nullptr; + std::unique_ptr pb_error_message; + std::unique_ptr bls_response; + AllocatedSharedMemory response_batch_shm; + bi::managed_external_buffer::handle_t* response_handle = nullptr; + + try { + bls_response = + IPCMessage::Create(Stub()->ShmPool(), false /* inline_response */); + + AllocatedSharedMemory request_batch = + Stub()->ShmPool()->Load(ipc_message->Args()); + RequestBatch* request_batch_shm_ptr = + reinterpret_cast(request_batch.data_.get()); + + bls_response->Command() = PYTHONSTUB_InferExecResponse; + ipc_message->ResponseHandle() = bls_response->ShmHandle(); + + // The response batch of the handle will contain a ResponseBatch + PrepareResponseBatch( + &response_batch, response_batch_shm, &bls_response, &response_handle); + + is_response_batch_set = true; + bool has_gpu_tensor = false; + GPUBuffersHelper gpu_buffer_helper; + + PythonBackendException pb_exception(std::string{}); + if (request_batch_shm_ptr->batch_size == 1) { + 
std::shared_ptr infer_request; + bi::managed_external_buffer::handle_t* request_handle = + reinterpret_cast( + request_batch.data_.get() + sizeof(RequestBatch)); + infer_request = InferRequest::LoadFromSharedMemory( + Stub()->ShmPool(), *request_handle, false /* open_cuda_handle */, + nullptr /* is_model_decoupled */); + + // If the BLS inputs are in GPU an additional round trip between the + // stub process and the main process is required. The reason is that we + // need to first allocate the GPU memory from the memory pool and then + // ask the stub process to fill in those allocated buffers. + try { + for (auto& input_tensor : infer_request->Inputs()) { + if (!input_tensor->IsCPU()) { +#ifdef TRITON_ENABLE_GPU + // Attempt to use the cuda shared memory pool for GPU tensor. + ShareCUDAMemoryPool(input_tensor->MemoryTypeId()); + BackendMemory* backend_memory; + std::unique_ptr lbackend_memory; + has_gpu_tensor = true; + TRITONSERVER_Error* error = BackendMemory::Create( + Model()->TritonMemoryManager(), + {BackendMemory::AllocationType::GPU_POOL, + BackendMemory::AllocationType::GPU}, + input_tensor->MemoryTypeId(), input_tensor->ByteSize(), + &backend_memory); + if (error != nullptr) { + LOG_MESSAGE( + TRITONSERVER_LOG_ERROR, TRITONSERVER_ErrorMessage(error)); + break; + } + lbackend_memory.reset(backend_memory); + input_tensor->SetMemory(std::move(PbMemory::Create( + Stub()->ShmPool(), std::move(lbackend_memory)))); + gpu_buffer_helper.AddBuffer(input_tensor->Memory()->ShmHandle()); +#endif // TRITON_ENABLE_GPU + } + } + } + catch (const PythonBackendException& exception) { + gpu_buffer_helper.SetError(Stub()->ShmPool(), exception.what()); + pb_exception = exception; + } + + // Wait for the extra round trip to complete. The stub process will fill + // in the data for the GPU tensors. If there is an error, the extra round + // trip must be still completed, otherwise the stub process will always be + // waiting for a message from the parent process. + if (has_gpu_tensor) { + gpu_buffer_helper.Complete(Stub()->ShmPool()); + request_batch_shm_ptr->gpu_buffers_handle = + gpu_buffer_helper.ShmHandle(); + + bi::scoped_lock lock{ + *(ipc_message->ResponseMutex())}; + ipc_message->ResponseCondition()->notify_all(); + ipc_message->ResponseCondition()->wait(lock); + } + + if (pb_exception.what() == std::string{""}) { + auto callback = std::bind( + &ModelInstanceState::SendBLSDecoupledResponse, this, + std::placeholders::_1); + std::shared_ptr infer_payload = + std::make_shared(is_decoupled, callback); + + auto response_future = + request_executor_->Infer(infer_request, infer_payload); + infer_response = response_future.get(); + + if (is_decoupled && (infer_response->Id() != nullptr)) { + // Need to manage the lifetime of InferPayload object for bls + // decoupled responses. 
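For decoupled BLS responses above, the InferPayload shared_ptr is parked in a map keyed by its raw address so it survives until the cleanup message erases it. A sketch of that lifetime pattern with illustrative Payload/Park/Complete names:

#include <cstdint>
#include <map>
#include <memory>
#include <mutex>

struct Payload { int id = 0; };

std::mutex g_mu;
std::map<std::intptr_t, std::shared_ptr<Payload>> g_inflight;

void Park(const std::shared_ptr<Payload>& p)
{
  std::lock_guard<std::mutex> lock(g_mu);
  g_inflight[reinterpret_cast<std::intptr_t>(p.get())] = p;
}

void Complete(std::intptr_t key)
{
  std::lock_guard<std::mutex> lock(g_mu);
  g_inflight.erase(key);  // the last reference may be dropped here
}

int main()
{
  auto payload = std::make_shared<Payload>();
  std::intptr_t key = reinterpret_cast<std::intptr_t>(payload.get());
  Park(payload);
  payload.reset();  // caller's reference gone, the map keeps it alive
  Complete(key);    // the async side releases it when the response arrives
  return 0;
}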
+ std::lock_guard lock(infer_payload_mu_); + infer_payload_[reinterpret_cast(infer_payload.get())] = + infer_payload; + } + + PrepareResponseHandle(&infer_response, response_handle); + } else { + throw pb_exception; + } + } + } + catch (const PythonBackendException& pb_exception) { + if (is_response_batch_set) { + response_batch->has_error = true; + LOG_IF_EXCEPTION( + pb_error_message = + PbString::Create(Stub()->ShmPool(), pb_exception.what())); + + if (pb_error_message != nullptr) { + response_batch->is_error_set = true; + response_batch->error = pb_error_message->ShmHandle(); + } + } else { + LOG_MESSAGE(TRITONSERVER_LOG_ERROR, pb_exception.what()); + } + } + + // At this point, the stub has notified the parent process that it has + // finished loading the inference response from shared memory. + { + bi::scoped_lock lock{ + *(ipc_message->ResponseMutex())}; + ipc_message->ResponseCondition()->notify_all(); + ipc_message->ResponseCondition()->wait(lock); + } +} + +void +ModelInstanceState::StubToParentMQMonitor() +{ + while (stub_to_parent_thread_) { + bi::managed_external_buffer::handle_t handle = + Stub()->StubToParentMessageQueue()->Pop(); + if (handle == DUMMY_MESSAGE) { + break; + } + std::unique_ptr message = + IPCMessage::LoadFromSharedMemory(Stub()->ShmPool(), handle); + + switch (message->Command()) { + case PYTHONSTUB_LogRequest: { + ProcessLogRequest(message); + break; + } + case PYTHONSTUB_BLSDecoupledInferPayloadCleanup: + case PYTHONSTUB_DecoupledResponseFactoryCleanup: { + ProcessCleanupRequest(message); + break; + } + case PYTHONSTUB_IsRequestCancelled: { + ProcessIsRequestCancelled(message); + break; + } + case PYTHONSTUB_MetricFamilyRequestNew: + case PYTHONSTUB_MetricFamilyRequestDelete: { + ProcessMetricFamilyRequest(message); + break; + } + case PYTHONSTUB_MetricRequestNew: + case PYTHONSTUB_MetricRequestDelete: + case PYTHONSTUB_MetricRequestValue: + case PYTHONSTUB_MetricRequestIncrement: + case PYTHONSTUB_MetricRequestSet: + case PYTHONSTUB_MetricRequestObserve: { + ProcessMetricRequest(message); + break; + } + case PYTHONSTUB_ModelReadinessRequest: + case PYTHONSTUB_LoadModelRequest: + case PYTHONSTUB_UnloadModelRequest: { + ProcessModelControlRequest(message); + break; + } + case PYTHONSTUB_ResponseSend: { + std::shared_ptr response_send_message = std::move(message); + std::packaged_task task([this, response_send_message] { + ResponseSendDecoupled(response_send_message); + }); + boost::asio::post(*thread_pool_, std::move(task)); + break; + } + case PYTHONSTUB_InferExecRequest: + case PYTHONSTUB_InferStreamExecRequest: { + std::shared_ptr bls_execute = std::move(message); + std::packaged_task task([this, bls_execute] { + ExecuteBLSRequest( + bls_execute, + (bls_execute->Command() == PYTHONSTUB_InferStreamExecRequest)); + }); + boost::asio::post(*thread_pool_, std::move(task)); + break; + } + case PYTHONSTUB_CancelBLSInferRequest: { + ProcessCancelBLSRequest(message); + break; + } + default: { + LOG_MESSAGE( + TRITONSERVER_LOG_ERROR, "Unexpected message type received."); + break; + } + } + } +} + +void +ModelInstanceState::ProcessLogRequest( + const std::unique_ptr& message) +{ + AllocatedSharedMemory log_message_response = + Stub()->ShmPool()->Load(message->Args()); + std::unique_ptr pb_log_message = + PbLogShm::LoadFromSharedMemory(Stub()->ShmPool(), message->Args()); + + const std::string& filename = pb_log_message->Filename(); + uint32_t line = pb_log_message->Line(); + const std::string& log_message = pb_log_message->Message(); + LogLevel level = 
pb_log_message->Level(); + + switch (level) { + case LogLevel::kInfo: { + TRITONSERVER_LogMessage( + TRITONSERVER_LOG_INFO, (filename.c_str()), line, + (log_message.c_str())); + break; + } + case LogLevel::kWarning: { + TRITONSERVER_LogMessage( + TRITONSERVER_LOG_WARN, (filename.c_str()), line, + (log_message.c_str())); + break; + } + case LogLevel::kError: { + TRITONSERVER_LogMessage( + TRITONSERVER_LOG_ERROR, (filename.c_str()), line, + (log_message.c_str())); + break; + } + case LogLevel::kVerbose: { + TRITONSERVER_LogMessage( + TRITONSERVER_LOG_VERBOSE, (filename.c_str()), line, + (log_message.c_str())); + break; + } + } + // Send confirmation back to pb_stub.cc that the message + // was received. + LogSendMessage* send_message_payload = + reinterpret_cast(log_message_response.data_.get()); + { + bi::scoped_lock guard{send_message_payload->mu}; + send_message_payload->waiting_on_stub = true; + send_message_payload->cv.notify_all(); + while (send_message_payload->waiting_on_stub) { + send_message_payload->cv.wait(guard); + } + } +} + +void +ModelInstanceState::ProcessCleanupRequest( + const std::unique_ptr& message) +{ + AllocatedSharedMemory cleanup_request_message = + Stub()->ShmPool()->Load(message->Args()); + CleanupMessage* cleanup_message_ptr = + reinterpret_cast(cleanup_request_message.data_.get()); + intptr_t id = reinterpret_cast(cleanup_message_ptr->id); + if (message->Command() == PYTHONSTUB_BLSDecoupledInferPayloadCleanup) { + // Remove the InferPayload object from the map. + std::lock_guard lock(infer_payload_mu_); + infer_payload_.erase(id); + } else if (message->Command() == PYTHONSTUB_DecoupledResponseFactoryCleanup) { + // Delete response factory + std::unique_ptr< + TRITONBACKEND_ResponseFactory, backend::ResponseFactoryDeleter> + response_factory(reinterpret_cast(id)); + } + + { + bi::scoped_lock lock{*(message->ResponseMutex())}; + cleanup_message_ptr->waiting_on_stub = true; + message->ResponseCondition()->notify_all(); + } +} + +void +ModelInstanceState::ProcessCancelBLSRequest( + const std::unique_ptr& message) +{ + AllocatedSharedMemory message_shm = + Stub()->ShmPool()->Load(message->Args()); + CancelBLSRequestMessage* message_payload = + reinterpret_cast(message_shm.data_.get()); + + { + bi::scoped_lock lk{message_payload->mu}; + + intptr_t id = reinterpret_cast(message_payload->infer_payload_id); + try { + { + std::lock_guard lock(infer_payload_mu_); + if (infer_payload_.find(id) != infer_payload_.end()) { + infer_payload_[id]->SafeCancelRequest(); + } + } + message_payload->is_cancelled = true; + } + catch (const PythonBackendException& pb_exception) { + LOG_MESSAGE(TRITONSERVER_LOG_ERROR, pb_exception.what()); + } + + message_payload->waiting_on_stub = true; + message_payload->cv.notify_all(); + while (message_payload->waiting_on_stub) { + message_payload->cv.wait(lk); + } + } +} + +void +ModelInstanceState::ProcessIsRequestCancelled( + const std::unique_ptr& message) +{ + AllocatedSharedMemory message_shm = + Stub()->ShmPool()->Load(message->Args()); + IsCancelledMessage* message_payload = + reinterpret_cast(message_shm.data_.get()); + + { + bi::scoped_lock lk{message_payload->mu}; + + if (message_payload->response_factory_address != 0) { + TRITONBACKEND_ResponseFactory* response_factory = + reinterpret_cast( + message_payload->response_factory_address); + TRITONBACKEND_ResponseFactoryIsCancelled( + response_factory, &message_payload->is_cancelled); + } else if (message_payload->request_address != 0) { + TRITONBACKEND_Request* request = reinterpret_cast( + 
message_payload->request_address); + TRITONBACKEND_RequestIsCancelled(request, &message_payload->is_cancelled); + } else { + throw PythonBackendException("Cannot determine request cancellation"); + } + + message_payload->waiting_on_stub = true; + message_payload->cv.notify_all(); + while (message_payload->waiting_on_stub) { + message_payload->cv.wait(lk); + } + } +} + +template +void +ModelInstanceState::ProcessMessage( + const std::unique_ptr& ipc_message, + std::function&, MessageType*)> request_handler) +{ + AllocatedSharedMemory message = + Stub()->ShmPool()->Load(ipc_message->Args()); + MessageType* message_ptr = + reinterpret_cast(message.data_.get()); + std::unique_ptr pb_error_message; + PythonBackendException pb_exception(std::string{}); + std::unique_ptr object = + T::LoadFromSharedMemory(Stub()->ShmPool(), message_ptr->message); + + ScopedDefer _([message_ptr] { + { + bi::scoped_lock guard{message_ptr->mu}; + message_ptr->waiting_on_stub = true; + message_ptr->cv.notify_all(); + while (message_ptr->waiting_on_stub) { + message_ptr->cv.wait(guard); + } + } + }); + + try { + request_handler(object, message_ptr); + } + catch (const PythonBackendException& exception) { + pb_exception = exception; + } + + if (pb_exception.what() != std::string{}) { + message_ptr->has_error = true; + LOG_IF_EXCEPTION( + pb_error_message = + PbString::Create(Stub()->ShmPool(), pb_exception.what())); + message_ptr->error = pb_error_message->ShmHandle(); + message_ptr->is_error_set = true; + } +} + +void +ModelInstanceState::ProcessMetricFamilyRequest( + const std::unique_ptr& message) +{ + auto command = message->Command(); + ProcessMessage( + message, [this, command]( + std::unique_ptr& metric_family, + CustomMetricsMessage* metrics_message_ptr) { + switch (command) { + case PYTHONSTUB_MetricFamilyRequestNew: { + metrics_message_ptr->address = + metric_family->InitializeTritonMetricFamily(); + break; + } + case PYTHONSTUB_MetricFamilyRequestDelete: { + metric_family->ClearTritonMetricFamily(); + break; + } + default: { + throw PythonBackendException("Unknown metric family request kind"); + } + } + }); +} + +void +ModelInstanceState::ProcessMetricRequest( + const std::unique_ptr& message) +{ + auto command = message->Command(); + ProcessMessage( + message, [this, command]( + std::unique_ptr& metric, + CustomMetricsMessage* metrics_message_ptr) { + try { + switch (command) { + case PYTHONSTUB_MetricRequestNew: { + metrics_message_ptr->address = metric->InitializeTritonMetric(); + break; + } + case PYTHONSTUB_MetricRequestIncrement: + case PYTHONSTUB_MetricRequestSet: + case PYTHONSTUB_MetricRequestObserve: + case PYTHONSTUB_MetricRequestValue: { + metric->HandleMetricOperation(metrics_message_ptr, command); + break; + } + case PYTHONSTUB_MetricRequestDelete: { + metric->ClearTritonMetric(); + break; + } + default: { + throw PythonBackendException("Unknown metric request kind"); + } + } + } + catch (const PythonBackendException& exception) { + throw exception; + } + }); +} + +void +ModelInstanceState::ProcessModelControlRequest( + const std::unique_ptr& message) +{ + auto command = message->Command(); + ModelState* model_state = reinterpret_cast(Model()); + ProcessMessage( + message, [this, command, model_state]( + std::unique_ptr& model_loader, + ModelLoaderMessage* model_loader_msg_ptr) { + switch (command) { + case PYTHONSTUB_LoadModelRequest: { + model_loader->LoadModel(model_state->TritonServer()); + break; + } + case PYTHONSTUB_UnloadModelRequest: { + 
model_loader->UnloadModel(model_state->TritonServer()); + break; + } + case PYTHONSTUB_ModelReadinessRequest: { + model_loader_msg_ptr->is_model_ready = + model_loader->IsModelReady(model_state->TritonServer()); + break; + } + default: { + throw PythonBackendException("Unknown model loader request kind"); + } + } + }); +} + +TRITONSERVER_Error* +ModelInstanceState::SendMessageToStub( + bi::managed_external_buffer::handle_t message) +{ + bool success = false; + while (!success) { + uint64_t timeout_miliseconds = 1000; + { + boost::posix_time::ptime timeout = + boost::get_system_time() + + boost::posix_time::milliseconds(timeout_miliseconds); + + bi::scoped_lock lock( + *(Stub()->HealthMutex()), timeout); + + // Check if lock has been acquired. + if (lock) { + Stub()->IpcControl()->stub_health = false; + } else { + // If it failed to obtain the lock, it means that the stub has been + // stuck or exited while holding the health mutex lock. + return TRITONSERVER_ErrorNew( + TRITONSERVER_ERROR_INTERNAL, "Failed to obtain the health mutex."); + } + } + + Stub()->StubMessageQueue()->Push( + message, timeout_miliseconds /* duration ms */, success); + + if (!success && !IsStubProcessAlive()) { + return TRITONSERVER_ErrorNew( + TRITONSERVER_ERROR_INTERNAL, "Stub process is not healthy."); + } + } + + return nullptr; // success +} + +void +ModelInstanceState::SendMessageAndReceiveResponse( + bi::managed_external_buffer::handle_t message, + bi::managed_external_buffer::handle_t& response, + std::shared_ptr>& responses, + TRITONBACKEND_Request** requests, const uint32_t request_count) +{ + auto error = SendMessageToStub(message); + if (error != nullptr) { + RespondErrorToAllRequests( + TRITONSERVER_ErrorMessage(error), responses, requests, request_count); + + return; + } + + bi::managed_external_buffer::handle_t response_message; + error = Stub()->ReceiveMessageFromStub(response_message); + if (error != nullptr) { + RespondErrorToAllRequests( + TRITONSERVER_ErrorMessage(error), responses, requests, request_count); + + return; + } + + response = response_message; +} + +void +ModelInstanceState::RespondErrorToAllRequests( + const char* message, + std::shared_ptr>& responses, + TRITONBACKEND_Request** requests, const uint32_t request_count) +{ + for (uint32_t r = 0; r < request_count; ++r) { + if ((*responses)[r] == nullptr) + continue; + + std::string err_message = + std::string( + "Failed to process the request(s) for model instance '" + Name() + + "', message: ") + + message; + + TRITONSERVER_Error* err = + TRITONSERVER_ErrorNew(TRITONSERVER_ERROR_INTERNAL, err_message.c_str()); + LOG_IF_ERROR( + TRITONBACKEND_ResponseSend( + (*responses)[r], TRITONSERVER_RESPONSE_COMPLETE_FINAL, err), + "failed sending response"); + + (*responses)[r] = nullptr; + TRITONSERVER_ErrorDelete(err); + } +} + + +void +ModelInstanceState::StartMonitor() +{ + stub_to_parent_thread_ = true; + stub_to_parent_queue_monitor_ = + std::thread(&ModelInstanceState::StubToParentMQMonitor, this); +} + +void +ModelInstanceState::TerminateMonitor() +{ + if (stub_to_parent_thread_) { + stub_to_parent_thread_ = false; + // Push a dummy message to signal the thread to terminate. 
+ Stub()->StubToParentMessageQueue()->Push(DUMMY_MESSAGE); + stub_to_parent_queue_monitor_.join(); + } +} + +void +ModelInstanceState::ResponseSendDecoupled( + std::shared_ptr response_send_message) +{ + AllocatedSharedMemory send_message = + Stub()->ShmPool()->Load( + response_send_message->Args()); + + ResponseSendMessage* send_message_payload = + reinterpret_cast(send_message.data_.get()); + std::unique_ptr error_message; + ScopedDefer response_factory_deleter([send_message_payload] { + if (send_message_payload->flags == TRITONSERVER_RESPONSE_COMPLETE_FINAL) { + TRITONBACKEND_ResponseFactory* response_factory = + reinterpret_cast( + send_message_payload->response_factory_address); + std::unique_ptr< + TRITONBACKEND_ResponseFactory, backend::ResponseFactoryDeleter> + lresponse_factory(reinterpret_cast( + response_factory)); + } + }); + ScopedDefer _([send_message_payload] { + { + bi::scoped_lock guard{send_message_payload->mu}; + send_message_payload->is_stub_turn = true; + send_message_payload->cv.notify_all(); + + while (send_message_payload->is_stub_turn) { + send_message_payload->cv.wait(guard); + } + } + }); + + TRITONBACKEND_ResponseFactory* response_factory = + reinterpret_cast( + send_message_payload->response_factory_address); + if (send_message_payload->flags == TRITONSERVER_RESPONSE_COMPLETE_FINAL) { + { + std::lock_guard guard{closed_requests_mutex_}; + closed_requests_.push_back(send_message_payload->request_address); + } + } + + if (send_message_payload->response != 0) { + std::unique_ptr infer_response = + InferResponse::LoadFromSharedMemory( + Stub()->ShmPool(), send_message_payload->response, + false /* open cuda ipc handle */); + + bool requires_deferred_callback = false; + TRITONBACKEND_Response* response; + SetErrorForResponseSendMessage( + send_message_payload, + WrapTritonErrorInSharedPtr( + TRITONBACKEND_ResponseNewFromFactory(&response, response_factory)), + error_message); + + std::vector, void*>> gpu_output_buffers; + GPUBuffersHelper gpu_buffer_helper; + +#ifdef TRITON_ENABLE_GPU + for (auto& output_tensor : infer_response->OutputTensors()) { + if (!output_tensor->IsCPU()) { + // Attempt to use the cuda shared memory pool for GPU tensor. + ShareCUDAMemoryPool(output_tensor->MemoryTypeId()); + } + } +#endif // TRITON_ENABLE_GPU + + infer_response->Send( + response, CudaStream(), requires_deferred_callback, + send_message_payload->flags, Stub()->ShmPool(), gpu_buffer_helper, + gpu_output_buffers); + + if (requires_deferred_callback) { + gpu_buffer_helper.Complete(Stub()->ShmPool()); + send_message_payload->gpu_buffers_handle = gpu_buffer_helper.ShmHandle(); + + // Additional round trip so that the stub can fill the GPU output buffers. 
+ { + bi::scoped_lock guard{send_message_payload->mu}; + send_message_payload->is_stub_turn = true; + send_message_payload->cv.notify_all(); + + while (send_message_payload->is_stub_turn) { + send_message_payload->cv.wait(guard); + } + } + + bool cuda_copy = false; + for (auto& output_buffer_pair : gpu_output_buffers) { + auto& pb_memory = output_buffer_pair.first; + void* pointer = output_buffer_pair.second; + bool cuda_used; + + try { + if (pb_memory->MemoryType() == TRITONSERVER_MEMORY_CPU) { + THROW_IF_TRITON_ERROR(CopyBuffer( + "Failed to copy the CPU output tensor to buffer.", + TRITONSERVER_MEMORY_CPU, 0, TRITONSERVER_MEMORY_CPU, 0, + pb_memory->ByteSize(), pb_memory->DataPtr(), pointer, + CudaStream(), &cuda_used)); + cuda_copy |= cuda_used; + } else if ( + (pb_memory->MemoryType() == TRITONSERVER_MEMORY_GPU) && + pb_memory->UseCUDASharedPool() && + (pb_memory->DataPtr() != pointer)) { + // If the data pointer from pb_memory is not the same as the + // pointer, it means that the Triton-provided buffer is not used + // during tensor transfer. Instead, an intermediate buffer that uses + // CUDA shared memory pool is used. In this case, we need to copy + // the data from the intermediate buffer back to the Triton-provided + // buffer. + THROW_IF_TRITON_ERROR(CopyBuffer( + "Failed to copy the GPU output tensor to buffer.", + TRITONSERVER_MEMORY_GPU, pb_memory->MemoryTypeId(), + TRITONSERVER_MEMORY_GPU, pb_memory->MemoryTypeId(), + pb_memory->ByteSize(), pb_memory->DataPtr(), pointer, + CudaStream(), &cuda_used)); + cuda_copy |= cuda_used; + } +#ifdef TRITON_ENABLE_GPU + if (cuda_copy) { + cudaStreamSynchronize(stream_); + } +#endif // TRITON_ENABLE_GPU + } + catch (const PythonBackendException& pb_exception) { + TRITONSERVER_Error* error = TRITONSERVER_ErrorNew( + TRITONSERVER_ERROR_INTERNAL, + (std::string( + "Failed to copy output tensor to Triton-provided buffer: ") + + pb_exception.what()) + .c_str()); + SetErrorForResponseSendMessage( + send_message_payload, WrapTritonErrorInSharedPtr(error), + error_message); + } + } + } + } else { + TRITONSERVER_Error* error = TRITONBACKEND_ResponseFactorySendFlags( + response_factory, send_message_payload->flags); + SetErrorForResponseSendMessage( + send_message_payload, WrapTritonErrorInSharedPtr(error), error_message); + } +} + +TRITONSERVER_Error* +ModelInstanceState::ProcessRequests( + TRITONBACKEND_Request** requests, const uint32_t request_count, + std::vector>& pb_infer_requests, + PbMetricReporter& reporter) +{ + NVTX_RANGE(nvtx_, "ProcessRequests " + Name()); + closed_requests_ = {}; + ModelState* model_state = reinterpret_cast(Model()); + + size_t total_batch_size = 0; + RETURN_IF_ERROR( + CheckIncomingRequests(requests, request_count, total_batch_size)); + + // No request to process + if (total_batch_size == 0) { + return nullptr; // success + } + + LOG_MESSAGE( + TRITONSERVER_LOG_VERBOSE, + (std::string("model ") + model_state->Name() + ", instance " + Name() + + ", executing " + std::to_string(request_count) + " requests") + .c_str()); + + AllocatedSharedMemory request_batch; + std::shared_ptr> responses; + + RETURN_IF_ERROR(SaveRequestsToSharedMemory( + requests, request_count, pb_infer_requests, request_batch, responses)); + + uint64_t compute_start_ns = 0; + SET_TIMESTAMP(compute_start_ns); + reporter.SetComputeStartNs(compute_start_ns); + + std::unique_ptr ipc_message; + RETURN_IF_EXCEPTION( + ipc_message = + IPCMessage::Create(Stub()->ShmPool(), false /*inline_response*/)); + ipc_message->Command() = 
PYTHONSTUB_CommandType::PYTHONSTUB_ExecuteRequest; + ipc_message->Args() = request_batch.handle_; + + ScopedDefer execute_finalize([this] { + // Push a dummy message to signal the thread to terminate. + Stub()->StubMessageQueue()->Push(DUMMY_MESSAGE); + }); + + std::unique_ptr response; + { + Stub()->StubMessageQueue()->Push(ipc_message->ShmHandle()); + bi::managed_external_buffer::handle_t response_message; + RETURN_IF_ERROR(Stub()->ReceiveMessageFromStub(response_message)); + response = + IPCMessage::LoadFromSharedMemory(Stub()->ShmPool(), response_message); + } + char* ipc_message_shm = + reinterpret_cast(response->GetAllocatedSharedMemory().data_.get()); + ResponseBatch* response_batch_shm_ptr = + reinterpret_cast(ipc_message_shm + sizeof(IPCMessageShm)); + + uint64_t compute_end_ns = 0; + SET_TIMESTAMP(compute_end_ns); + reporter.SetComputeEndNs(compute_end_ns); + reporter.SetBatchStatistics(total_batch_size); + + if (response_batch_shm_ptr->has_error) { + // Clean up the response factory if an error occurred. The + // `is_response_factory_deleted` flag indicates whether the response factory + // has been deleted for some corner cases. + if (!response_batch_shm_ptr->is_response_factory_deleted) { + for (uint32_t r = 0; r < request_count; r++) { + TRITONBACKEND_ResponseFactory* response_factory = + reinterpret_cast( + pb_infer_requests[r]->GetResponseFactoryAddress()); + std::unique_ptr< + TRITONBACKEND_ResponseFactory, backend::ResponseFactoryDeleter> + lresponse_factory(reinterpret_cast( + response_factory)); + } + } + if (response_batch_shm_ptr->is_error_set) { + auto error = PbString::LoadFromSharedMemory( + Stub()->ShmPool(), response_batch_shm_ptr->error); + return TRITONSERVER_ErrorNew( + TRITONSERVER_ERROR_INTERNAL, error->String().c_str()); + } + + return TRITONSERVER_ErrorNew( + TRITONSERVER_ERROR_INTERNAL, "Failed to process the requests."); + } + + if (response_batch_shm_ptr->batch_size > 0) { + bi::managed_external_buffer::handle_t* response_shm_handle = + reinterpret_cast( + ipc_message_shm + sizeof(ResponseBatch) + sizeof(IPCMessageShm)); + + std::shared_ptr> responses( + new std::vector()); + responses->reserve(request_count); + for (size_t i = 0; i < request_count; i++) { + // It is possible to have multiple responses batched together in a single + // response batch shm, where some of the responses are None due to the + // usage of response sender, so only create a TRITONBACKEND_Response + // object for the valid responses. 
+ if (response_shm_handle[i] == 0) { + responses->emplace_back(nullptr); + } else { + TRITONBACKEND_Response* response; + auto err = TRITONBACKEND_ResponseNew(&response, requests[i]); + if (err == nullptr) { + responses->emplace_back(response); + } else { + responses->emplace_back(nullptr); + LOG_MESSAGE(TRITONSERVER_LOG_ERROR, "Fail to create response"); + TRITONSERVER_ErrorDelete(err); + } + } + } + + std::vector requires_deferred_callback; + + bool has_gpu_output = false; + std::vector> shm_responses; + std::vector, void*>>> + gpu_output_buffers(request_count); + GPUBuffersHelper gpu_buffer_helper; + + for (uint32_t r = 0; r < request_count; ++r) { + NVTX_RANGE(nvtx_, "LoadingResponse " + Name()); + requires_deferred_callback.push_back(false); + if (response_shm_handle[r] == 0) { + continue; + } + TRITONBACKEND_Response* response = (*responses)[r]; + TRITONBACKEND_Request* request = requests[r]; + uint32_t requested_output_count = 0; + + shm_responses.emplace_back(nullptr); + std::unique_ptr& infer_response = shm_responses.back(); + try { + if (pb_infer_requests[r]->ReleaseFlags() == + TRITONSERVER_REQUEST_RELEASE_RESCHEDULE) { + // For rescheduled requests, we do not need to send a response. + LOG_IF_ERROR( + TRITONBACKEND_ResponseDelete((*responses)[r]), + "failed to delete response"); + (*responses)[r] = nullptr; + continue; + } + { + TRITONBACKEND_ResponseFactory* response_factory = + reinterpret_cast( + pb_infer_requests[r]->GetResponseFactoryAddress()); + std::unique_ptr< + TRITONBACKEND_ResponseFactory, backend::ResponseFactoryDeleter> + lresponse_factory( + reinterpret_cast( + response_factory)); + } + infer_response = InferResponse::LoadFromSharedMemory( + Stub()->ShmPool(), response_shm_handle[r], + false /* open_cuda_handle */); + if (infer_response->HasError()) { + TRITONSERVER_Error* err = TRITONSERVER_ErrorNew( + infer_response->Error()->Code(), + infer_response->Error()->Message().c_str()); + + LOG_IF_ERROR( + TRITONBACKEND_ResponseSend( + (*responses)[r], TRITONSERVER_RESPONSE_COMPLETE_FINAL, err), + "failed sending response"); + TRITONSERVER_ErrorDelete(err); + (*responses)[r] = nullptr; + + // Reset the release flags for the request. + pb_infer_requests[r]->SetReleaseFlags( + TRITONSERVER_REQUEST_RELEASE_ALL); + + // If has_error is true, we do not look at the response tensors. + continue; + } + } + catch (const PythonBackendException& pb_exception) { + TRITONSERVER_Error* err = TRITONSERVER_ErrorNew( + TRITONSERVER_ERROR_INTERNAL, pb_exception.what()); + LOG_IF_ERROR( + TRITONBACKEND_ResponseSend( + (*responses)[r], TRITONSERVER_RESPONSE_COMPLETE_FINAL, err), + "failed sending response"); + TRITONSERVER_ErrorDelete(err); + (*responses)[r] = nullptr; + + // Reset the release flags for the request. + pb_infer_requests[r]->SetReleaseFlags(TRITONSERVER_REQUEST_RELEASE_ALL); + + continue; + } + + GUARDED_RESPOND_IF_ERROR( + responses, r, + TRITONBACKEND_RequestOutputCount(request, &requested_output_count)); + std::set requested_output_names; + for (size_t j = 0; j < requested_output_count; ++j) { + const char* output_name; + GUARDED_RESPOND_IF_ERROR( + responses, r, + TRITONBACKEND_RequestOutputName(request, j, &output_name)); + requested_output_names.insert(output_name); + } + + bool require_deferred_callback = false; + +#ifdef TRITON_ENABLE_GPU + for (auto& output_tensor : infer_response->OutputTensors()) { + if (output_tensor->MemoryType() == TRITONSERVER_MEMORY_GPU) { + // Attempt to use the cuda shared memory pool for GPU tensor. 
+ ShareCUDAMemoryPool(output_tensor->MemoryTypeId()); + } + } +#endif // TRITON_ENABLE_GPU + + gpu_output_buffers[r] = + std::vector, void*>>{}; + infer_response->Send( + response, CudaStream(), require_deferred_callback, + TRITONSERVER_RESPONSE_COMPLETE_FINAL, Stub()->ShmPool(), + gpu_buffer_helper, gpu_output_buffers[r], requested_output_names); + + requires_deferred_callback[r] = require_deferred_callback; + + if (requires_deferred_callback[r]) { + has_gpu_output = true; + } + } + + execute_finalize.Complete(); + + // If the output tensor is in GPU, there will be a second round trip + // required for filling the GPU buffers provided by the main process. + if (has_gpu_output) { + ipc_message->Command() = + PYTHONSTUB_CommandType::PYTHONSTUB_LoadGPUBuffers; + gpu_buffer_helper.Complete(Stub()->ShmPool()); + ipc_message->Args() = gpu_buffer_helper.ShmHandle(); + bi::managed_external_buffer::handle_t response_message; + SendMessageAndReceiveResponse( + ipc_message->ShmHandle(), response_message, responses, requests, 0); + + bool cuda_copy = false; + + uint32_t response_index = 0; + for (auto& gpu_output_buffer : gpu_output_buffers) { + for (auto& buffer_memory_pair : gpu_output_buffer) { + auto& pb_memory = buffer_memory_pair.first; + void* pointer = buffer_memory_pair.second; + bool cuda_used = false; + + if (pb_memory->MemoryType() == TRITONSERVER_MEMORY_CPU) { + GUARDED_RESPOND_IF_ERROR( + responses, response_index, + CopyBuffer( + "Failed to copy the output tensor to buffer.", + TRITONSERVER_MEMORY_CPU, 0, TRITONSERVER_MEMORY_CPU, 0, + pb_memory->ByteSize(), pb_memory->DataPtr(), pointer, + CudaStream(), &cuda_used)); + cuda_copy |= cuda_used; + } else if ( + (pb_memory->MemoryType() == TRITONSERVER_MEMORY_GPU) && + pb_memory->UseCUDASharedPool() && + (pb_memory->DataPtr() != pointer)) { + // If the data pointer from pb_memory is not the same as the + // pointer, it means that the Triton-provided buffer is not used + // during tensor transfer. Instead, an intermediate buffer that uses + // CUDA shared memory pool is used. In this case, we need to copy + // the data from the intermediate buffer back to the Triton-provided + // buffer. 
+ GUARDED_RESPOND_IF_ERROR( + responses, response_index, + CopyBuffer( + "Failed to copy the output tensor to buffer.", + TRITONSERVER_MEMORY_GPU, pb_memory->MemoryTypeId(), + TRITONSERVER_MEMORY_GPU, pb_memory->MemoryTypeId(), + pb_memory->ByteSize(), pb_memory->DataPtr(), pointer, + CudaStream(), &cuda_used)); + cuda_copy |= cuda_used; + } + } + response_index++; +#ifdef TRITON_ENABLE_GPU + if (cuda_copy) { + cudaStreamSynchronize(stream_); + } +#endif // TRITON_ENABLE_GPU + } + } + + for (uint32_t r = 0; r < request_count; ++r) { + if (requires_deferred_callback[r]) { + shm_responses[r]->DeferredSendCallback(); + } + } + } + + return nullptr; // success +} + +void +ModelInstanceState::PrepareResponseBatch( + ResponseBatch** response_batch, + AllocatedSharedMemory& response_batch_shm, + std::unique_ptr* ipc_message, + bi::managed_external_buffer::handle_t** response_handle) +{ + response_batch_shm = Stub()->ShmPool()->Construct( + sizeof(ResponseBatch) + sizeof(bi::managed_external_buffer::handle_t)); + *response_batch = + reinterpret_cast(response_batch_shm.data_.get()); + (*ipc_message)->Args() = response_batch_shm.handle_; + + *response_handle = reinterpret_cast( + response_batch_shm.data_.get() + sizeof(ResponseBatch)); + + (*response_batch)->batch_size = 1; + (*response_batch)->has_error = false; + (*response_batch)->is_error_set = false; + (*response_batch)->cleanup = false; + (*response_batch)->response_size = 1; +} + +void +ModelInstanceState::PrepareResponseHandle( + std::unique_ptr* infer_response, + bi::managed_external_buffer::handle_t* response_handle) +{ +#ifdef TRITON_ENABLE_GPU + for (auto& output_tensor : (*infer_response)->OutputTensors()) { + if (!output_tensor->IsCPU()) { + // Attempt to use the cuda shared memory pool for GPU tensor. + ShareCUDAMemoryPool(output_tensor->MemoryTypeId()); + // It's possible that the CUDA memory pool offset isn't set correctly, + // even if the BLS output is using CUDA memory. This can occur when the + // CUDA memory pool hasn't been shared with the stub process at the time + // the BLS output is allocated during the ResponseAlloc callback. In such + // cases, we need to adjust the CUDA pool offset accordingly. + if (!output_tensor->Memory()->UseCUDASharedPool()) { + output_tensor->Memory()->UpdateCUDAOffset( + Stub()->ShmPool()->GetCUDAMemoryPoolManager()); + } + } + } +#endif // TRITON_ENABLE_GPU + + (*infer_response)->SaveToSharedMemory(Stub()->ShmPool()); + + for (auto& output_tensor : (*infer_response)->OutputTensors()) { + if (!output_tensor->IsCPU()) { +#ifdef TRITON_ENABLE_GPU + std::unique_ptr memory_record; + // Need to transfer the ownership of the BackendMemory to the + // MemoryManager so that the lifetime of the BackendMemory is managed. 
+ memory_record = std::make_unique( + output_tensor->Memory()->GetBackendMemory()); + uint64_t memory_release_id = + Stub()->GetMemoryManager()->AddRecord(std::move(memory_record)); + output_tensor->Memory()->SetMemoryReleaseId(memory_release_id); +#endif + } + } + + *response_handle = (*infer_response)->ShmHandle(); +} + +void +ModelInstanceState::SendBLSDecoupledResponse( + std::unique_ptr infer_response) +{ + bool is_response_batch_set = false; + ResponseBatch* response_batch = nullptr; + std::unique_ptr pb_error_message; + std::unique_ptr ipc_message; + AllocatedSharedMemory response_batch_shm; + bi::managed_external_buffer::handle_t* response_handle = nullptr; + + try { + ipc_message = + IPCMessage::Create(Stub()->ShmPool(), true /* inline_response */); + ipc_message->Args() = response_batch_shm.handle_; + ipc_message->Command() = PYTHONSTUB_InferStreamExecResponse; + PrepareResponseBatch( + &response_batch, response_batch_shm, &ipc_message, &response_handle); + is_response_batch_set = true; + response_batch->waiting_on_stub = false; + PrepareResponseHandle(&infer_response, response_handle); + } + catch (const PythonBackendException& pb_exception) { + if (is_response_batch_set) { + response_batch->has_error = true; + LOG_IF_EXCEPTION( + pb_error_message = + PbString::Create(Stub()->ShmPool(), pb_exception.what())); + + if (pb_error_message != nullptr) { + response_batch->is_error_set = true; + response_batch->error = pb_error_message->ShmHandle(); + } + } else { + LOG_MESSAGE(TRITONSERVER_LOG_ERROR, pb_exception.what()); + } + } + + { + bi::scoped_lock lock{ + *(ipc_message->ResponseMutex())}; + Stub()->ParentToStubMessageQueue()->Push(ipc_message->ShmHandle()); + while (!response_batch->waiting_on_stub) { + ipc_message->ResponseCondition()->wait(lock); + } + } +} + +void +ModelInstanceState::ShareCUDAMemoryPool(const int32_t device_id) +{ +#ifdef TRITON_ENABLE_GPU + try { + Stub()->ShareCUDAMemoryPool(Model()->TritonMemoryManager(), device_id); + } + catch (const PythonBackendException& ex) { + LOG_MESSAGE( + TRITONSERVER_LOG_WARN, + (std::string("Failed to share CUDA memory pool with stub process: ") + + ex.what() + ". Will use CUDA IPC.") + .c_str()); + } +#endif // TRITON_ENABLE_GPU +} + +ModelInstanceState::~ModelInstanceState() +{ + Stub()->UpdateHealth(); + if (Stub()->IsHealthy()) { + // Wait for all the pending tasks to finish. + thread_pool_->wait(); + } + // Terminate stub first to allow any last messages to be received by the back + // end before deallocating the queue memory + Stub()->TerminateStub(); + TerminateMonitor(); + Stub()->ClearQueues(); + Stub().reset(); +} + +TRITONSERVER_Error* +ModelState::Create(TRITONBACKEND_Model* triton_model, ModelState** state) +{ + try { + *state = new ModelState(triton_model); + } + catch (const BackendModelException& ex) { + RETURN_ERROR_IF_TRUE( + ex.err_ == nullptr, TRITONSERVER_ERROR_INTERNAL, + std::string("unexpected nullptr in BackendModelException")); + RETURN_IF_ERROR(ex.err_); + } + + // Auto-complete the configuration if requested... 
+ bool auto_complete_config = false; + RETURN_IF_ERROR(TRITONBACKEND_ModelAutoCompleteConfig( + triton_model, &auto_complete_config)); + if (auto_complete_config) { + RETURN_IF_ERROR((*state)->LaunchAutoCompleteStubProcess()); + (*state)->ModelConfig() = std::move((*state)->Stub()->AutoCompleteConfig()); + RETURN_IF_ERROR((*state)->SetModelConfig()); + + (*state)->Stub()->UpdateHealth(); + (*state)->Stub()->TerminateStub(); + (*state)->Stub()->ClearQueues(); + (*state)->Stub().reset(); + } + + RETURN_IF_ERROR((*state)->ValidateModelConfig()); + + return nullptr; // success +} + +ModelState::ModelState(TRITONBACKEND_Model* triton_model) + : BackendModel(triton_model, true /* allow_optional */) +{ + TRITONBACKEND_Backend* backend; + THROW_IF_BACKEND_MODEL_ERROR( + TRITONBACKEND_ModelBackend(triton_model, &backend)); + + const char* path = nullptr; + TRITONBACKEND_ArtifactType artifact_type; + THROW_IF_BACKEND_MODEL_ERROR( + TRITONBACKEND_ModelRepository(triton_model, &artifact_type, &path)); + python_execution_env_ = ""; + force_cpu_only_input_tensors_ = true; + decoupled_ = false; + + void* bstate; + THROW_IF_BACKEND_MODEL_ERROR(TRITONBACKEND_BackendState(backend, &bstate)); + backend_state_ = reinterpret_cast(bstate); + + runtime_modeldir_ = backend_state_->runtime_modeldir; + triton::common::TritonJson::Value params; + common::TritonJson::Value model_config; + if (model_config_.Find("parameters", ¶ms)) { + // Skip the EXECUTION_ENV_PATH variable if it doesn't exist. + TRITONSERVER_Error* error = + GetParameterValue(params, "EXECUTION_ENV_PATH", &python_execution_env_); + if (error == nullptr) { + std::string relative_path_keyword = "$$TRITON_MODEL_DIRECTORY"; + size_t relative_path_loc = + python_execution_env_.find(relative_path_keyword); + if (relative_path_loc != std::string::npos) { + python_execution_env_.replace( + relative_path_loc, relative_path_loc + relative_path_keyword.size(), + path); + } + LOG_MESSAGE( + TRITONSERVER_LOG_INFO, + (std::string("Using Python execution env ") + python_execution_env_) + .c_str()); + } else { + // Delete the error + TRITONSERVER_ErrorDelete(error); + } + + triton::common::TritonJson::Value model_transaction_policy; + if (model_config_.Find( + "model_transaction_policy", &model_transaction_policy)) { + triton::common::TritonJson::Value decoupled; + if (model_transaction_policy.Find("decoupled", &decoupled)) { + auto error = decoupled.AsBool(&decoupled_); + if (error != nullptr) { + throw BackendModelException(error); + } + } + } + + // Skip the FORCE_CPU_ONLY_INPUT_TENSORS variable if it doesn't exits. + std::string force_cpu_only_input_tensor; + error = nullptr; + error = GetParameterValue( + params, "FORCE_CPU_ONLY_INPUT_TENSORS", &force_cpu_only_input_tensor); + if (error == nullptr) { + if (force_cpu_only_input_tensor == "yes") { + force_cpu_only_input_tensors_ = true; + LOG_MESSAGE( + TRITONSERVER_LOG_INFO, + (std::string("Forcing CPU only input tensors.")).c_str()); + } else if (force_cpu_only_input_tensor == "no") { + force_cpu_only_input_tensors_ = false; + LOG_MESSAGE( + TRITONSERVER_LOG_INFO, + (std::string("Input tensors can be both in CPU and GPU. 
" + "FORCE_CPU_ONLY_INPUT_TENSORS is off.")) + .c_str()); + } else { + throw BackendModelException(TRITONSERVER_ErrorNew( + TRITONSERVER_ERROR_UNSUPPORTED, + (std::string("Incorrect value for FORCE_CPU_ONLY_INPUT_TENSORS: ") + + force_cpu_only_input_tensor + "'") + .c_str())); + } + } else { + // Delete the error + TRITONSERVER_ErrorDelete(error); + } + } + + if (artifact_type != TRITONBACKEND_ARTIFACT_FILESYSTEM) { + throw BackendModelException(TRITONSERVER_ErrorNew( + TRITONSERVER_ERROR_UNSUPPORTED, + (std::string("unsupported artifact type for model '") + Name() + "'") + .c_str())); + } +} + +TRITONSERVER_Error* +ModelState::LaunchAutoCompleteStubProcess() +{ + Stub() = std::make_unique("AUTOCOMPLETE_STUB"); + RETURN_IF_ERROR(Stub()->Initialize(this)); + try { + RETURN_IF_ERROR(Stub()->Setup()); + RETURN_IF_ERROR(Stub()->Launch()); + } + catch (const BackendModelException& ex) { + Stub()->UpdateHealth(); + Stub()->TerminateStub(); + Stub()->ClearQueues(); + Stub().reset(); + RETURN_ERROR_IF_TRUE( + ex.err_ == nullptr, TRITONSERVER_ERROR_INTERNAL, + std::string("unexpected nullptr in BackendModelException")); + RETURN_IF_ERROR(ex.err_); + } + + return nullptr; +} + +TRITONSERVER_Error* +ModelState::ValidateModelConfig() +{ + // We have the json DOM for the model configuration... + triton::common::TritonJson::WriteBuffer buffer; + RETURN_IF_ERROR(ModelConfig().PrettyWrite(&buffer)); + LOG_MESSAGE( + TRITONSERVER_LOG_VERBOSE, + (std::string("model configuration:\n") + buffer.Contents()).c_str()); + + return nullptr; +} + +TRITONSERVER_Error* +ModelState::SetModelConfig() +{ + BackendModel::SetModelConfig(); + // `Update model_transaction_policy` if setting was set + // with `set_model_transaction_policy` + triton::common::TritonJson::Value model_transaction_policy; + bool is_decoupled = false; + if (ModelConfig().Find( + "model_transaction_policy", &model_transaction_policy)) { + triton::common::TritonJson::Value decoupled; + if (model_transaction_policy.Find("decoupled", &decoupled)) { + auto error = decoupled.AsBool(&is_decoupled); + if (error != nullptr) { + throw BackendModelException(error); + } + SetDecoupled(is_decoupled); + } + } + + return nullptr; +} + + +extern "C" { + +TRITONBACKEND_ISPEC TRITONSERVER_Error* +TRITONBACKEND_Initialize(TRITONBACKEND_Backend* backend) +{ + const char* cname; + RETURN_IF_ERROR(TRITONBACKEND_BackendName(backend, &cname)); + std::string name(cname); + + // Check backend version to ensure compatibility + uint32_t api_version_major, api_version_minor; + RETURN_IF_ERROR( + TRITONBACKEND_ApiVersion(&api_version_major, &api_version_minor)); + LOG_MESSAGE( + TRITONSERVER_LOG_VERBOSE, + (std::string("'") + name + "' TRITONBACKEND API version: " + + std::to_string(TRITONBACKEND_API_VERSION_MAJOR) + "." 
+ + std::to_string(TRITONBACKEND_API_VERSION_MINOR)) + .c_str()); + + if ((api_version_major != TRITONBACKEND_API_VERSION_MAJOR) || + (api_version_minor < TRITONBACKEND_API_VERSION_MINOR)) { + return TRITONSERVER_ErrorNew( + TRITONSERVER_ERROR_UNSUPPORTED, + "Triton backend API version does not support this backend"); + } + + TRITONSERVER_Message* backend_config_message; + RETURN_IF_ERROR( + TRITONBACKEND_BackendConfig(backend, &backend_config_message)); + + const char* buffer; + size_t byte_size; + RETURN_IF_ERROR(TRITONSERVER_MessageSerializeToJson( + backend_config_message, &buffer, &byte_size)); + LOG_MESSAGE( + TRITONSERVER_LOG_VERBOSE, + (std::string("backend configuration:\n") + buffer).c_str()); + + triton::common::TritonJson::Value backend_config; + if (byte_size != 0) { + RETURN_IF_ERROR(backend_config.Parse(buffer, byte_size)); + } + + std::unique_ptr backend_state(new BackendState()); + triton::common::TritonJson::Value cmdline; + backend_state->shm_default_byte_size = 1 * 1024 * 1024; // 1 MB + backend_state->shm_growth_byte_size = 1 * 1024 * 1024; // 1 MB + backend_state->stub_timeout_seconds = 30; + backend_state->shm_message_queue_size = 1000; + backend_state->thread_pool_size = 32; + // Initialize shared memory region prefix to include backend's name + // to avoid collision between python backend and python-based backends. + backend_state->shared_memory_region_prefix = + "triton_" + name + "_backend_shm_region_"; + std::string default_backend_dir_string; + + if (backend_config.Find("cmdline", &cmdline)) { + triton::common::TritonJson::Value shm_growth_size; + std::string shm_growth_byte_size; + if (cmdline.Find("shm-growth-byte-size", &shm_growth_size)) { + RETURN_IF_ERROR(shm_growth_size.AsString(&shm_growth_byte_size)); + try { + backend_state->shm_growth_byte_size = std::stol(shm_growth_byte_size); + if (backend_state->shm_growth_byte_size <= 0) { + return TRITONSERVER_ErrorNew( + TRITONSERVER_ERROR_INVALID_ARG, + (std::string("shm-growth-byte-size") + + " can't be smaller than or equal to zero.") + .c_str()); + } + } + catch (const std::invalid_argument& ia) { + return TRITONSERVER_ErrorNew(TRITONSERVER_ERROR_INVALID_ARG, ia.what()); + } + } + + triton::common::TritonJson::Value shm_default_size; + std::string shm_default_byte_size; + if (cmdline.Find("shm-default-byte-size", &shm_default_size)) { + RETURN_IF_ERROR(shm_default_size.AsString(&shm_default_byte_size)); + try { + backend_state->shm_default_byte_size = std::stol(shm_default_byte_size); + // Shared memory default byte size can't be less than 1 MB. + if (backend_state->shm_default_byte_size < 1 * 1024 * 1024) { + return TRITONSERVER_ErrorNew( + TRITONSERVER_ERROR_INVALID_ARG, + (std::string("shm-default-byte-size") + + " can't be smaller than 4 MiBs") + .c_str()); + } + } + catch (const std::invalid_argument& ia) { + return TRITONSERVER_ErrorNew(TRITONSERVER_ERROR_INVALID_ARG, ia.what()); + } + } + + triton::common::TritonJson::Value thread_pool_size; + std::string thread_pool_count; + if (cmdline.Find("thread-pool-size", &thread_pool_size)) { + RETURN_IF_ERROR(thread_pool_size.AsString(&thread_pool_count)); + try { + backend_state->thread_pool_size = std::stol(thread_pool_count); + // Shared memory default byte size can't be less than 4 MBs. 
+ if (backend_state->thread_pool_size < 1) { + return TRITONSERVER_ErrorNew( + TRITONSERVER_ERROR_INVALID_ARG, + (std::string("thread-pool-size") + " can't be less than 1.") + .c_str()); + } + } + catch (const std::invalid_argument& ia) { + return TRITONSERVER_ErrorNew(TRITONSERVER_ERROR_INVALID_ARG, ia.what()); + } + } + + triton::common::TritonJson::Value shm_region_prefix; + std::string shm_region_prefix_str; + if (cmdline.Find("shm-region-prefix-name", &shm_region_prefix)) { + RETURN_IF_ERROR(shm_region_prefix.AsString(&shm_region_prefix_str)); + // Shared memory default byte size can't be less than 4 MBs. + if (shm_region_prefix_str.size() == 0) { + return TRITONSERVER_ErrorNew( + TRITONSERVER_ERROR_INVALID_ARG, + (std::string("shm-region-prefix-name") + + " must at least contain one character.") + .c_str()); + } + backend_state->shared_memory_region_prefix = shm_region_prefix_str; + } + + triton::common::TritonJson::Value shm_message_queue_size; + std::string shm_message_queue_size_str; + if (cmdline.Find("shm_message_queue_size", &shm_message_queue_size)) { + RETURN_IF_ERROR( + shm_message_queue_size.AsString(&shm_message_queue_size_str)); + try { + backend_state->shm_message_queue_size = + std::stol(shm_message_queue_size_str); + } + catch (const std::invalid_argument& ia) { + return TRITONSERVER_ErrorNew(TRITONSERVER_ERROR_INVALID_ARG, ia.what()); + } + } + + triton::common::TritonJson::Value stub_timeout_seconds; + std::string stub_timeout_string_seconds; + if (cmdline.Find("stub-timeout-seconds", &stub_timeout_seconds)) { + RETURN_IF_ERROR( + stub_timeout_seconds.AsString(&stub_timeout_string_seconds)); + try { + backend_state->stub_timeout_seconds = + std::stol(stub_timeout_string_seconds); + if (backend_state->stub_timeout_seconds <= 0) { + return TRITONSERVER_ErrorNew( + TRITONSERVER_ERROR_INVALID_ARG, + (std::string("stub-timeout-seconds") + + " can't be smaller than or equal to zero.") + .c_str()); + } + } + catch (const std::invalid_argument& ia) { + return TRITONSERVER_ErrorNew(TRITONSERVER_ERROR_INVALID_ARG, ia.what()); + } + } + + triton::common::TritonJson::Value default_backend_dir; + if (cmdline.Find("backend-directory", &default_backend_dir)) { + RETURN_IF_ERROR( + default_backend_dir.AsString(&default_backend_dir_string)); + } + } + + LOG_MESSAGE( + TRITONSERVER_LOG_VERBOSE, + (std::string("Shared memory configuration is shm-default-byte-size=") + + std::to_string(backend_state->shm_default_byte_size) + + ",shm-growth-byte-size=" + + std::to_string(backend_state->shm_growth_byte_size) + + ",stub-timeout-seconds=" + + std::to_string(backend_state->stub_timeout_seconds)) + .c_str()); + + // Use BackendArtifacts to determine the location of Python files + const char* clocation; + TRITONBACKEND_ArtifactType artifact_type; + RETURN_IF_ERROR( + TRITONBACKEND_BackendArtifacts(backend, &artifact_type, &clocation)); + + const char os_slash = std::filesystem::path::preferred_separator; + std::string location(clocation); +#ifdef _WIN32 + const std::string stub_executable_name = "triton_python_backend_stub.exe"; + SanitizePath(location); + SanitizePath(default_backend_dir_string); +#else + const std::string stub_executable_name = "triton_python_backend_stub"; +#endif + // Check if `triton_python_backend_stub` and `triton_python_backend_utils.py` + // are located under `location`. 
+ std::string default_python_backend_dir = + default_backend_dir_string + os_slash + "python"; + std::string backend_stub_path = location + os_slash + stub_executable_name; + std::string backend_utils = + location + os_slash + "triton_python_backend_utils.py"; + // Both, stub and utils should be in the same location + if (FileExists(backend_stub_path) && FileExists(backend_utils)) { + backend_state->python_lib = location; + // If `location` is default location of a python backend, + // then we are using default python backend. + if (default_python_backend_dir == location) { + backend_state->runtime_modeldir = ""; + } else { + // If `location` is not default location of a python backend, + // then we are using a python backend based backend and model.py stored + // in the received location. + backend_state->runtime_modeldir = location; + } + } else { + // If stub and utils are not found in received `location`, + // then we are using a python backend based backend and stub and utils are + // stored in the default python backend location. + if (!default_backend_dir_string.empty()) { + std::string backend_stub_path = default_backend_dir_string + os_slash + + "python" + os_slash + + stub_executable_name; + if (!FileExists(backend_stub_path)) { + return TRITONSERVER_ErrorNew( + TRITONSERVER_ERROR_NOT_FOUND, + (stub_executable_name + " is not found. Searched paths: " + + default_backend_dir_string + os_slash + "python and " + location) + .c_str()); + } + } + backend_state->runtime_modeldir = location; + backend_state->python_lib = + default_backend_dir_string + os_slash + "python"; + } +// FIXME [DLIS-5969]: Enable for Windows when custom execution environments +// are supported. +#ifndef _WIN32 + backend_state->env_manager = std::make_unique(); +#endif + + RETURN_IF_ERROR(TRITONBACKEND_BackendSetState( + backend, reinterpret_cast(backend_state.get()))); + + backend_state.release(); + return nullptr; +} + +TRITONBACKEND_ISPEC TRITONSERVER_Error* +TRITONBACKEND_Finalize(TRITONBACKEND_Backend* backend) +{ + LOG_MESSAGE(TRITONSERVER_LOG_VERBOSE, "TRITONBACKEND_Finalize: Start"); + void* vstate; + RETURN_IF_ERROR(TRITONBACKEND_BackendState(backend, &vstate)); + auto backend_state = reinterpret_cast(vstate); + delete backend_state; + LOG_MESSAGE(TRITONSERVER_LOG_VERBOSE, "TRITONBACKEND_Finalize: End"); + return nullptr; // success +} + +TRITONBACKEND_ISPEC TRITONSERVER_Error* +TRITONBACKEND_ModelInitialize(TRITONBACKEND_Model* model) +{ + const char* cname; + RETURN_IF_ERROR(TRITONBACKEND_ModelName(model, &cname)); + std::string name(cname); + + uint64_t version; + RETURN_IF_ERROR(TRITONBACKEND_ModelVersion(model, &version)); + + TRITONSERVER_LogMessage( + TRITONSERVER_LOG_VERBOSE, __FILE__, __LINE__, + (std::string("TRITONBACKEND_ModelInitialize: ") + name + " (version " + + std::to_string(version) + ")") + .c_str()); + + TRITONBACKEND_Backend* backend; + RETURN_IF_ERROR(TRITONBACKEND_ModelBackend(model, &backend)); + + ModelState* model_state; + RETURN_IF_ERROR(ModelState::Create(model, &model_state)); + RETURN_IF_ERROR( + TRITONBACKEND_ModelSetState(model, reinterpret_cast(model_state))); + + return nullptr; +} + +TRITONBACKEND_ISPEC TRITONSERVER_Error* +TRITONBACKEND_ModelFinalize(TRITONBACKEND_Model* model) +{ + void* vstate; + RETURN_IF_ERROR(TRITONBACKEND_ModelState(model, &vstate)); + ModelState* model_state = reinterpret_cast(vstate); + + LOG_MESSAGE( + TRITONSERVER_LOG_VERBOSE, + "TRITONBACKEND_ModelFinalize: delete model state"); + + delete model_state; + + return nullptr; +} + 
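The handlers earlier in this file (ProcessLogRequest, ProcessCancelBLSRequest, ProcessIsRequestCancelled, ResponseSendDecoupled) all synchronize with the stub through the same shared-memory handshake: the parent flips a `waiting_on_stub` (or `is_stub_turn`) flag under an interprocess mutex, notifies the condition variable, and then waits for the stub to flip the flag back. The following is a minimal, self-contained sketch of that pattern only; the struct and function names here are illustrative and do not appear in the backend.

#include <boost/interprocess/sync/interprocess_condition.hpp>
#include <boost/interprocess/sync/interprocess_mutex.hpp>
#include <boost/interprocess/sync/scoped_lock.hpp>

namespace bi = boost::interprocess;

// Hypothetical payload living in shared memory, mirroring the
// mutex/condition/flag triple carried by the messages above.
struct HandshakePayload {
  bi::interprocess_mutex mu;
  bi::interprocess_condition cv;
  bool waiting_on_stub = false;
};

// Parent side: hand the turn to the stub and block until it hands it back.
void
ParentSignalAndWait(HandshakePayload* payload)
{
  bi::scoped_lock<bi::interprocess_mutex> guard{payload->mu};
  payload->waiting_on_stub = true;
  payload->cv.notify_all();
  while (payload->waiting_on_stub) {
    payload->cv.wait(guard);
  }
}

// Stub side: wait for the turn, do the work, then return the turn.
void
StubAcknowledge(HandshakePayload* payload)
{
  bi::scoped_lock<bi::interprocess_mutex> guard{payload->mu};
  while (!payload->waiting_on_stub) {
    payload->cv.wait(guard);
  }
  payload->waiting_on_stub = false;  // the stub's work happens before this flip
  payload->cv.notify_all();
}

Because both sides block on the same flag under the same mutex, neither process can race ahead of the other; this is why the backend can safely reuse a single shared-memory message slot for each round trip.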
+TRITONBACKEND_ISPEC TRITONSERVER_Error* +TRITONBACKEND_ModelInstanceInitialize(TRITONBACKEND_ModelInstance* instance) +{ + const char* cname; + RETURN_IF_ERROR(TRITONBACKEND_ModelInstanceName(instance, &cname)); + std::string name(cname); + + int32_t device_id; + RETURN_IF_ERROR(TRITONBACKEND_ModelInstanceDeviceId(instance, &device_id)); + TRITONSERVER_InstanceGroupKind kind; + RETURN_IF_ERROR(TRITONBACKEND_ModelInstanceKind(instance, &kind)); + + LOG_MESSAGE( + TRITONSERVER_LOG_INFO, + (std::string("TRITONBACKEND_ModelInstanceInitialize: ") + name + " (" + + TRITONSERVER_InstanceGroupKindString(kind) + " device " + + std::to_string(device_id) + ")") + .c_str()); + + TRITONBACKEND_Model* model; + RETURN_IF_ERROR(TRITONBACKEND_ModelInstanceModel(instance, &model)); + + void* vmodelstate; + RETURN_IF_ERROR(TRITONBACKEND_ModelState(model, &vmodelstate)); + ModelState* model_state = reinterpret_cast(vmodelstate); + + ModelInstanceState* instance_state; + RETURN_IF_ERROR( + ModelInstanceState::Create(model_state, instance, &instance_state)); + RETURN_IF_ERROR(TRITONBACKEND_ModelInstanceSetState( + instance, reinterpret_cast(instance_state))); + + RETURN_IF_ERROR(instance_state->LaunchStubProcess()); + LOG_MESSAGE( + TRITONSERVER_LOG_VERBOSE, + (std::string("TRITONBACKEND_ModelInstanceInitialize: instance " + "initialization successful ") + + name + " (device " + std::to_string(device_id) + ")") + .c_str()); + + return nullptr; +} + +TRITONBACKEND_ISPEC TRITONSERVER_Error* +TRITONBACKEND_ModelInstanceExecute( + TRITONBACKEND_ModelInstance* instance, TRITONBACKEND_Request** requests, + const uint32_t request_count) +{ + ModelInstanceState* instance_state; + RETURN_IF_ERROR(TRITONBACKEND_ModelInstanceState( + instance, reinterpret_cast(&instance_state))); + + TRITONSERVER_Error* error = nullptr; + + // If restart is equal to true, it indicates that the stub process is + // unhealthy and needs a restart. + // TODO: Implement restart on decoupled + + std::vector> infer_requests; + { + uint64_t exec_start_ns = 0; + SET_TIMESTAMP(exec_start_ns); + + PbMetricReporter reporter( + instance_state->TritonModelInstance(), requests, request_count, + nullptr); + reporter.SetExecStartNs(exec_start_ns); + + error = instance_state->ProcessRequests( + requests, request_count, infer_requests, reporter); + + uint64_t exec_end_ns = 0; + SET_TIMESTAMP(exec_end_ns); + reporter.SetExecEndNs(exec_end_ns); + + if (error != nullptr) { + reporter.SetSuccessStatus(false); + for (uint32_t r = 0; r < request_count; ++r) { + TRITONBACKEND_Request* request = requests[r]; + if (!instance_state->ExistsInClosedRequests( + reinterpret_cast(request))) { + TRITONBACKEND_Response* response = nullptr; + LOG_IF_ERROR( + TRITONBACKEND_ResponseNew(&response, request), + "Failed to create a new response."); + + if (response != nullptr) { + LOG_IF_ERROR( + TRITONBACKEND_ResponseSend( + response, TRITONSERVER_RESPONSE_COMPLETE_FINAL, error), + "Failed to send the error response."); + } + } + } + + for (auto& infer_request : infer_requests) { + // Reset the release flags for all the requests. + infer_request->SetReleaseFlags(TRITONSERVER_REQUEST_RELEASE_ALL); + } + } + } + + // The InferRequest object might not be created if an error occurs. Explicitly + // update the release flags here based on the number of InferRequest objects. 
+ std::vector request_release_flags( + request_count, TRITONSERVER_REQUEST_RELEASE_ALL); + for (size_t i = 0; i < infer_requests.size(); ++i) { + request_release_flags[i] = infer_requests[i]->ReleaseFlags(); + } + + for (uint32_t r = 0; r < request_count; ++r) { + TRITONBACKEND_Request* request = requests[r]; + try { + THROW_IF_TRITON_ERROR( + TRITONBACKEND_RequestRelease(request, request_release_flags[r])); + } + catch (const PythonBackendException& pb_exception) { + LOG_MESSAGE( + TRITONSERVER_LOG_ERROR, + (std::string("Failed to release request: ") + pb_exception.what()) + .c_str()); + if (request_release_flags[r] == TRITONSERVER_REQUEST_RELEASE_RESCHEDULE) { + // If error occurs during request rescheduling, release the request with + // `TRITONSERVER_REQUEST_RELEASE_ALL` flag. + LOG_IF_ERROR( + TRITONBACKEND_RequestRelease( + request, TRITONSERVER_REQUEST_RELEASE_ALL), + "Failed to release request."); + } + } + } + + LOG_MESSAGE( + TRITONSERVER_LOG_VERBOSE, + (std::string("TRITONBACKEND_ModelInstanceExecute: model instance name ") + + instance_state->Name() + " released " + std::to_string(request_count) + + " requests") + .c_str()); + + return nullptr; +} + +TRITONBACKEND_ISPEC TRITONSERVER_Error* +TRITONBACKEND_ModelInstanceFinalize(TRITONBACKEND_ModelInstance* instance) +{ + void* vstate; + RETURN_IF_ERROR(TRITONBACKEND_ModelInstanceState(instance, &vstate)); + ModelInstanceState* instance_state = + reinterpret_cast(vstate); + + LOG_MESSAGE( + TRITONSERVER_LOG_VERBOSE, + "TRITONBACKEND_ModelInstanceFinalize: delete instance state"); + + delete instance_state; + + return nullptr; +} + +TRITONBACKEND_ISPEC TRITONSERVER_Error* +TRITONBACKEND_GetBackendAttribute( + TRITONBACKEND_Backend* backend, + TRITONBACKEND_BackendAttribute* backend_attributes) +{ + LOG_MESSAGE( + TRITONSERVER_LOG_VERBOSE, + "TRITONBACKEND_GetBackendAttribute: setting attributes"); + // Specify different preferred instance kind based on backend compatibility, + // so Triton core won't blindly auto-complete kind that may not be supported. + // Other instance groups setting are set to "no value" so that Triton core + // will auto-complete them with default policy. +#ifdef TRITON_ENABLE_GPU + RETURN_IF_ERROR(TRITONBACKEND_BackendAttributeAddPreferredInstanceGroup( + backend_attributes, TRITONSERVER_INSTANCEGROUPKIND_GPU, 0, nullptr, 0)); +#else + RETURN_IF_ERROR(TRITONBACKEND_BackendAttributeAddPreferredInstanceGroup( + backend_attributes, TRITONSERVER_INSTANCEGROUPKIND_CPU, 0, nullptr, 0)); +#endif + + // This backend can safely handle parallel calls to + // TRITONBACKEND_ModelInstanceInitialize (thread-safe). + RETURN_IF_ERROR(TRITONBACKEND_BackendAttributeSetParallelModelInstanceLoading( + backend_attributes, true)); + + return nullptr; +} + +} // extern "C" +}}} // namespace triton::backend::python diff --git a/src/python_be.h b/src/python_be.h new file mode 100644 index 00000000..6082c50b --- /dev/null +++ b/src/python_be.h @@ -0,0 +1,433 @@ +// Copyright 2022-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions +// are met: +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. 
+// * Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// * Neither the name of NVIDIA CORPORATION nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +#pragma once + +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "infer_request.h" +#include "infer_response.h" +#include "ipc_message.h" +#include "memory_manager.h" +#include "message_queue.h" +#include "metric.h" +#include "metric_family.h" +#include "pb_env.h" +#include "pb_map.h" +#include "pb_metric_reporter.h" +#include "pb_utils.h" +#include "request_executor.h" +#include "scoped_defer.h" +#include "shm_manager.h" +#include "stub_launcher.h" +#include "triton/backend/backend_common.h" +#include "triton/backend/backend_input_collector.h" +#include "triton/backend/backend_memory.h" +#include "triton/backend/backend_model.h" +#include "triton/backend/backend_model_instance.h" +#include "triton/common/nvtx.h" +#include "triton/common/triton_json.h" +#include "triton/core/tritonbackend.h" +#include "triton/core/tritonserver.h" + +#ifdef _WIN32 +#define NOMINMAX +#include +#else +#include +#include +#endif + +#define LOG_IF_EXCEPTION(X) \ + do { \ + try { \ + (X); \ + } \ + catch (const PythonBackendException& pb_exception) { \ + LOG_MESSAGE(TRITONSERVER_LOG_ERROR, pb_exception.what()); \ + } \ + } while (false) + +#define RESPOND_ALL_AND_RETURN_IF_ERROR(RESPONSES, RESPONSES_COUNT, X) \ + do { \ + TRITONSERVER_Error* raasnie_err__ = (X); \ + if (raasnie_err__ != nullptr) { \ + for (size_t ridx = 0; ridx < RESPONSES_COUNT; ++ridx) { \ + if ((*RESPONSES)[ridx] != nullptr) { \ + LOG_IF_ERROR( \ + TRITONBACKEND_ResponseSend( \ + (*RESPONSES)[ridx], TRITONSERVER_RESPONSE_COMPLETE_FINAL, \ + raasnie_err__), \ + "failed to send error response"); \ + (*RESPONSES)[ridx] = nullptr; \ + } \ + } \ + TRITONSERVER_ErrorDelete(raasnie_err__); \ + return; \ + } \ + } while (false) + + +#define RESPOND_ALL_AND_RETURN_IF_EXCEPTION(RESPONSES, RESPONSES_COUNT, X) \ + do { \ + try { \ + (X); \ + } \ + catch (const PythonBackendException& exception) { \ + TRITONSERVER_Error* raarie_err__ = TRITONSERVER_ErrorNew( \ + TRITONSERVER_ERROR_INTERNAL, exception.what()); \ + for (size_t ridx = 0; ridx < RESPONSES_COUNT; ++ridx) { \ + if ((*RESPONSES)[ridx] != nullptr) { \ + LOG_IF_ERROR( \ + 
TRITONBACKEND_ResponseSend( \ + (*RESPONSES)[ridx], TRITONSERVER_RESPONSE_COMPLETE_FINAL, \ + raarie_err__), \ + "failed to send error response"); \ + (*RESPONSES)[ridx] = nullptr; \ + } \ + } \ + TRITONSERVER_ErrorDelete(raarie_err__); \ + return; \ + } \ + } while (false) + +#define RESPOND_AND_RETURN_IF_ERROR(REQUEST, X) \ + do { \ + TRITONSERVER_Error* rarie_err__ = (X); \ + if (rarie_err__ != nullptr) { \ + TRITONBACKEND_Response* rarie_response__ = nullptr; \ + LOG_IF_ERROR( \ + TRITONBACKEND_ResponseNew(&rarie_response__, REQUEST), \ + "failed to create response"); \ + if (rarie_response__ != nullptr) { \ + LOG_IF_ERROR( \ + TRITONBACKEND_ResponseSend( \ + rarie_response__, TRITONSERVER_RESPONSE_COMPLETE_FINAL, \ + rarie_err__), \ + "failed to send error response"); \ + } \ + return rarie_err__; \ + } \ + } while (false) + +#define GUARDED_RESPOND_IF_ERROR(RESPONSES, IDX, X) \ + do { \ + if ((*RESPONSES)[IDX] != nullptr) { \ + TRITONSERVER_Error* err__ = (X); \ + if (err__ != nullptr) { \ + LOG_IF_ERROR( \ + TRITONBACKEND_ResponseSend( \ + (*RESPONSES)[IDX], TRITONSERVER_RESPONSE_COMPLETE_FINAL, \ + err__), \ + "failed to send error response"); \ + (*RESPONSES)[IDX] = nullptr; \ + TRITONSERVER_ErrorDelete(err__); \ + } \ + } \ + } while (false) + +#define GUARDED_RESPOND_IF_EXCEPTION(RESPONSES, IDX, X) \ + do { \ + if ((*RESPONSES)[IDX] != nullptr) { \ + try { \ + (X); \ + } \ + catch (const PythonBackendException& pb_exception) { \ + TRITONSERVER_Error* err__ = TRITONSERVER_ErrorNew( \ + TRITONSERVER_ERROR_INTERNAL, pb_exception.what()); \ + LOG_IF_ERROR( \ + TRITONBACKEND_ResponseSend( \ + (*RESPONSES)[IDX], TRITONSERVER_RESPONSE_COMPLETE_FINAL, \ + err__), \ + "failed to send error response"); \ + (*RESPONSES)[IDX] = nullptr; \ + TRITONSERVER_ErrorDelete(err__); \ + } \ + } \ + } while (false) + +#define RETURN_IF_EXCEPTION(X) \ + do { \ + try { \ + (X); \ + } \ + catch (const PythonBackendException& pb_exception) { \ + TRITONSERVER_Error* rarie_err__ = TRITONSERVER_ErrorNew( \ + TRITONSERVER_ERROR_INTERNAL, pb_exception.what()); \ + return rarie_err__; \ + } \ + } while (false) + +namespace triton { namespace backend { namespace python { + +namespace bi = boost::interprocess; + +struct BackendState { + std::string python_lib; + int64_t shm_default_byte_size; + int64_t shm_growth_byte_size; + int64_t stub_timeout_seconds; + int64_t shm_message_queue_size; + std::atomic number_of_instance_inits; + std::string shared_memory_region_prefix; + int64_t thread_pool_size; + +// FIXME [DLIS-5969]: Enable for Windows when custom execution environments +// are supported. +#ifndef _WIN32 + std::unique_ptr env_manager; +#endif + std::string runtime_modeldir; +}; + +class ModelState : public BackendModel { + public: + static TRITONSERVER_Error* Create( + TRITONBACKEND_Model* triton_model, ModelState** state); + + // Get backend state + BackendState* StateForBackend() { return backend_state_; } + + // Get the Python execution environment + std::string PythonExecutionEnv() { return python_execution_env_; } + + // Force CPU only tensors + bool ForceCPUOnlyInputTensors() { return force_cpu_only_input_tensors_; } + + // Is decoupled API being used. + bool IsDecoupled() { return decoupled_; } + + // Set decoupled mode + void SetDecoupled(bool decoupled) { decoupled_ = decoupled; } + + // Returns the value in the `runtime_modeldir_` field + std::string RuntimeModelDir() { return runtime_modeldir_; } + + // Launch auto-complete stub process. 
+  TRITONSERVER_Error* LaunchAutoCompleteStubProcess();
+
+  // Validate Model Configuration
+  TRITONSERVER_Error* ValidateModelConfig();
+
+  // Overrides `BackendModel::SetModelConfig` to also
+  // set `ModelState::decoupled_`
+  TRITONSERVER_Error* SetModelConfig();
+
+  // Auto-complete stub
+  std::unique_ptr<StubLauncher>& Stub() { return auto_complete_stub_; }
+
+ private:
+  ModelState(TRITONBACKEND_Model* triton_model);
+  BackendState* backend_state_;
+  std::string python_execution_env_;
+  bool force_cpu_only_input_tensors_;
+  bool decoupled_;
+  std::string runtime_modeldir_;
+  std::unique_ptr<StubLauncher> auto_complete_stub_;
+};
+
+class ModelInstanceState : public BackendModelInstance {
+  ModelInstanceState(
+      ModelState* model_state, TRITONBACKEND_ModelInstance* model_instance);
+
+  TRITONBACKEND_Model* triton_model_;
+  std::unique_ptr<StubLauncher> model_instance_stub_;
+  std::vector<intptr_t> closed_requests_;
+  std::mutex closed_requests_mutex_;
+
+  std::thread stub_to_parent_queue_monitor_;
+  bool stub_to_parent_thread_;
+  std::mutex mu_;
+  std::condition_variable cv_;
+  std::unique_ptr<IPCMessage> received_message_;
+  std::vector<std::future<void>> futures_;
+  std::unique_ptr<boost::asio::thread_pool> thread_pool_;
+  std::unordered_map<intptr_t, std::shared_ptr<InferPayload>> infer_payload_;
+  std::mutex infer_payload_mu_;
+  std::unique_ptr<RequestExecutor> request_executor_;
+
+ public:
+  static TRITONSERVER_Error* Create(
+      ModelState* model_state, TRITONBACKEND_ModelInstance* model_instance,
+      ModelInstanceState** model_instance_state);
+
+  ~ModelInstanceState();
+
+  // Launch stub process.
+  TRITONSERVER_Error* LaunchStubProcess();
+
+  void ResponseSendDecoupled(
+      std::shared_ptr<IPCMessage> response_send_message);
+
+  // The parent message queue is monitored only by this function during the
+  // execute phase. No other thread should pop any message from the message
+  // queue.
+  void MessageQueueMonitor();
+
+  // This function is executed on a separate thread and monitors the queue for
+  // message sent from stub to parent process.
+  void StubToParentMQMonitor();
+
+  // Process the log request.
+  void ProcessLogRequest(const std::unique_ptr<IPCMessage>& message);
+
+  // Convert TRITONBACKEND_Input to Python backend tensors.
+  TRITONSERVER_Error* GetInputTensor(
+      const uint32_t input_idx, std::shared_ptr<PbTensor>& input_tensor,
+      TRITONBACKEND_Request* request,
+      std::shared_ptr<std::vector<TRITONBACKEND_Response*>>& responses);
+
+  // Process all the requests in the decoupled mode.
+ TRITONSERVER_Error* ProcessRequests( + TRITONBACKEND_Request** requests, const uint32_t request_count, + std::vector>& pb_infer_requests, + PbMetricReporter& pb_metric_reporter); + + bool ExistsInClosedRequests(intptr_t closed_request); + + // Execute a BLS Request + void ExecuteBLSRequest( + std::shared_ptr ipc_message, const bool is_stream); + + // Cleanup BLS responses + void CleanupBLSResponses(); + + // Check the incoming requests for errors + TRITONSERVER_Error* CheckIncomingRequests( + TRITONBACKEND_Request** requests, const uint32_t request_count, + size_t& total_batch_size); + + // Set error for response send message + void SetErrorForResponseSendMessage( + ResponseSendMessage* response_send_message, + std::shared_ptr error, + std::unique_ptr& error_message); + + TRITONSERVER_Error* SaveRequestsToSharedMemory( + TRITONBACKEND_Request** requests, const uint32_t request_count, + std::vector>& pb_inference_requests, + AllocatedSharedMemory& request_batch, + std::shared_ptr>& responses); + + void SendMessageAndReceiveResponse( + bi::managed_external_buffer::handle_t message, + bi::managed_external_buffer::handle_t& response, + std::shared_ptr>& responses, + TRITONBACKEND_Request** requests, const uint32_t request_count); + + void RespondErrorToAllRequests( + const char* message, + std::shared_ptr>& responses, + TRITONBACKEND_Request** requests, const uint32_t request_count); + + // void SendMessageToStub(bi::managed_external_buffer::handle_t message); + TRITONSERVER_Error* SendMessageToStub( + bi::managed_external_buffer::handle_t message); + + // Checks whether the stub process is live + bool IsStubProcessAlive(); + + // Model instance stub + std::unique_ptr& Stub() { return model_instance_stub_; } + + // Stop the stub_to_parent_queue_monitor thread + void TerminateMonitor(); + + // Start the stub_to_parent_queue_monitor thread + void StartMonitor(); + + // Send bls decoupled response to the stub process + void SendBLSDecoupledResponse(std::unique_ptr infer_response); + + // Prepare the response batch object + void PrepareResponseBatch( + ResponseBatch** response_batch, + AllocatedSharedMemory& response_batch_shm, + std::unique_ptr* ipc_message, + bi::managed_external_buffer::handle_t** response_handle); + + // Prepare the response handle + void PrepareResponseHandle( + std::unique_ptr* infer_response, + bi::managed_external_buffer::handle_t* response_handle); + + // Process the decoupled cleanup request for InferPayload and ResponseFactory + void ProcessCleanupRequest(const std::unique_ptr& message); + + // Process cancelling a BLS request + void ProcessCancelBLSRequest(const std::unique_ptr& message); + + // Process request cancellation query + void ProcessIsRequestCancelled(const std::unique_ptr& message); + + // Process a message. The function 'request_handler' is invoked + // to handle the request. T should be either 'MetricFamily', 'Metric' or + // 'ModelLoader', and MessageType should be either 'MetricFamilyMessage', + // 'MetricMessage' or 'ModelLoaderMessage'. 
+ template + void ProcessMessage( + const std::unique_ptr& message, + std::function&, MessageType*)> request_handler); + + // Process a metric family request + void ProcessMetricFamilyRequest(const std::unique_ptr& message); + + // Process a metric request + void ProcessMetricRequest(const std::unique_ptr& message); + + // Process a model control request + void ProcessModelControlRequest(const std::unique_ptr& message); + + // Attempt to share CUDA memory pool with the stub process + void ShareCUDAMemoryPool(const int32_t device_id); +}; +}}} // namespace triton::backend::python diff --git a/src/request_executor.cc b/src/request_executor.cc new file mode 100644 index 00000000..716d3c56 --- /dev/null +++ b/src/request_executor.cc @@ -0,0 +1,541 @@ +// Copyright 2021-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions +// are met: +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// * Neither the name of NVIDIA CORPORATION nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
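The templated `ProcessMessage` declared above centralizes a dispatch pattern: hand the typed message payload to a handler and record any failure on the message so the stub process can observe it. Below is a simplified, self-contained sketch of that pattern; the struct, its error fields, and the single template parameter are stand-ins rather than the actual shared-memory layout or signature.

#include <functional>
#include <memory>
#include <stdexcept>
#include <string>

// Stand-in for a typed shared-memory message payload; illustrative only.
struct ExampleMetricMessage {
  bool has_error = false;
  std::string error;
};

// Generic dispatcher: run the supplied handler and record any failure on the
// message. This mirrors the intent of the declaration above, not its actual
// implementation.
template <typename MessageType>
void
DispatchExample(
    std::unique_ptr<MessageType>& message,
    std::function<void(MessageType*)> request_handler)
{
  try {
    request_handler(message.get());
  }
  catch (const std::exception& ex) {
    message->has_error = true;
    message->error = ex.what();
  }
}

// Usage:
//   auto msg = std::make_unique<ExampleMetricMessage>();
//   DispatchExample<ExampleMetricMessage>(
//       msg, [](ExampleMetricMessage* m) { /* handle the request */ });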
+ +#include "request_executor.h" + +#include + +#include "correlation_id.h" +#include "pb_utils.h" +#include "scoped_defer.h" +#include "triton/backend/backend_common.h" +#include "triton/core/tritonserver.h" + +namespace triton { namespace backend { namespace python { + +TRITONSERVER_Error* +CreateTritonErrorFromException(const PythonBackendException& pb_exception) +{ + return TRITONSERVER_ErrorNew( + TRITONSERVER_ERROR_INTERNAL, pb_exception.what()); +} + +TRITONSERVER_Error* +MemoryTypeToTritonMemoryType( + TRITONSERVER_MemoryType* triton_memory_type, + const PreferredMemory::MemoryType& memory_type) +{ + switch (memory_type) { + case PreferredMemory::MemoryType::kCPU: + *triton_memory_type = TRITONSERVER_MEMORY_CPU; + break; + case PreferredMemory::MemoryType::kGPU: + *triton_memory_type = TRITONSERVER_MEMORY_GPU; + break; + + default: + return TRITONSERVER_ErrorNew( + TRITONSERVER_ERROR_INTERNAL, "Unknown memory type"); + } + + return nullptr; +} + +void +InferRequestComplete( + TRITONSERVER_InferenceRequest* request, const uint32_t flags, void* userp) +{ + if (request != nullptr) { + RequestCompletionUserp* completion_userp = + reinterpret_cast(userp); + completion_userp->infer_payload->SetRequestAddress(0L); + + LOG_IF_ERROR( + TRITONSERVER_InferenceRequestDelete(request), + "Failed to delete inference request."); + + delete completion_userp; + } +} + +void +InferResponseComplete( + TRITONSERVER_InferenceResponse* response, const uint32_t flags, void* userp) +{ + auto linfer_payload = reinterpret_cast(userp); + std::shared_ptr infer_payload = linfer_payload->GetPtr(); + std::unique_ptr infer_response; + std::vector> output_tensors; + std::shared_ptr pb_error; + std::string parameters_string; + TRITONSERVER_Error_Code error_code = TRITONSERVER_ERROR_INTERNAL; + + if (response != nullptr) { + try { + TRITONSERVER_Error* server_error = + TRITONSERVER_InferenceResponseError(response); + if (server_error != nullptr) { + error_code = TRITONSERVER_ErrorCode(server_error); + } + THROW_IF_TRITON_ERROR(server_error); + + uint32_t output_count; + THROW_IF_TRITON_ERROR( + TRITONSERVER_InferenceResponseOutputCount(response, &output_count)); + + for (uint32_t idx = 0; idx < output_count; ++idx) { + const char* cname; + TRITONSERVER_DataType datatype; + const int64_t* shape; + uint64_t dim_count; + const void* base; + size_t byte_size; + TRITONSERVER_MemoryType memory_type; + int64_t memory_type_id; + void* userp; + + THROW_IF_TRITON_ERROR(TRITONSERVER_InferenceResponseOutput( + response, idx, &cname, &datatype, &shape, &dim_count, &base, + &byte_size, &memory_type, &memory_type_id, &userp)); + std::string sname = cname; + std::vector dims_vector{shape, shape + dim_count}; + + if (memory_type != TRITONSERVER_MEMORY_GPU) { + if (byte_size != 0) { + std::shared_ptr pb_tensor = std::make_shared( + sname, dims_vector, datatype, memory_type, memory_type_id, + const_cast(base), byte_size, + nullptr /* DLManagedTensor */); + + // Load the data so that it is deallocated automatically. 
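The comment above marks the point where ownership of the allocator-provided buffer, carried through the opaque `userp` cookie, is re-adopted so the memory is released automatically. A generic sketch of that handoff pattern follows; the buffer type and function names are illustrative, not part of the patch.

#include <cstddef>
#include <cstdint>
#include <memory>
#include <vector>

// Stand-in buffer type; illustrative only.
struct ExampleBuffer {
  std::vector<uint8_t> bytes;
};

// Allocator side: release ownership into the opaque cookie that will later be
// handed back to the completion callback.
void*
AllocateCookie(std::size_t byte_size)
{
  auto buffer = std::make_unique<ExampleBuffer>();
  buffer->bytes.resize(byte_size);
  return buffer.release();
}

// Completion side: re-adopt the pointer so the buffer is freed exactly once,
// either here or by whatever object (e.g. a tensor) takes it over.
void
CompleteWithCookie(void* userp)
{
  std::unique_ptr<ExampleBuffer> buffer(
      reinterpret_cast<ExampleBuffer*>(userp));
}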
+ std::unique_ptr pb_memory( + reinterpret_cast(userp)); + pb_tensor->SetMemory(std::move(pb_memory)); + output_tensors.push_back(pb_tensor); + } else { + output_tensors.push_back(std::make_shared( + sname, dims_vector, datatype, memory_type, memory_type_id, + const_cast(base), byte_size, + nullptr /* DLManagedTensor */)); + } + } else { + std::shared_ptr pb_tensor = std::make_shared( + sname, dims_vector, datatype, memory_type, memory_type_id, + const_cast(base), byte_size, + nullptr /* DLManagedTensor */); + + std::unique_ptr pb_memory( + reinterpret_cast(userp)); + pb_tensor->SetMemory(std::move(pb_memory)); + output_tensors.push_back(pb_tensor); + } + } + + triton::common::TritonJson::Value parameters_json( + triton::common::TritonJson::ValueType::OBJECT); + uint32_t parameter_count; + THROW_IF_TRITON_ERROR(TRITONSERVER_InferenceResponseParameterCount( + response, ¶meter_count)); + + for (size_t i = 0; i < parameter_count; i++) { + const char* name; + TRITONSERVER_ParameterType type; + const void* vvalue; + THROW_IF_TRITON_ERROR(TRITONSERVER_InferenceResponseParameter( + response, i, &name, &type, &vvalue)); + if (type == TRITONSERVER_PARAMETER_INT) { + THROW_IF_TRITON_ERROR(parameters_json.AddInt( + name, *(reinterpret_cast(vvalue)))); + } else if (type == TRITONSERVER_PARAMETER_BOOL) { + THROW_IF_TRITON_ERROR(parameters_json.AddBool( + name, *(reinterpret_cast(vvalue)))); + } else if (type == TRITONSERVER_PARAMETER_STRING) { + std::string string = reinterpret_cast(vvalue); + THROW_IF_TRITON_ERROR(parameters_json.AddString(name, string)); + } else { + throw PythonBackendException( + (std::string("Unsupported parameter type for parameter '") + + name + "'.")); + } + } + + triton::common::TritonJson::WriteBuffer buffer; + THROW_IF_TRITON_ERROR(parameters_json.Write(&buffer)); + parameters_string = buffer.Contents(); + } + catch (const PythonBackendException& pb_exception) { + if (response != nullptr) { + LOG_IF_ERROR( + TRITONSERVER_InferenceResponseDelete(response), + "Failed to delete inference response."); + + response = nullptr; + } + pb_error = std::make_shared(pb_exception.what(), error_code); + output_tensors.clear(); + } + + if (!infer_payload->IsDecoupled()) { + infer_response = std::make_unique( + output_tensors, pb_error, parameters_string, + true /* is_last_response */); + } else { + if ((flags & TRITONSERVER_RESPONSE_COMPLETE_FINAL) == 0) { + // Not the last response. + infer_response = std::make_unique( + output_tensors, pb_error, parameters_string, + false /* is_last_response */, userp /* id */); + } else { + // The last response. + infer_response = std::make_unique( + output_tensors, pb_error, parameters_string, + true /* is_last_response */, userp /* id */); + } + } + + LOG_IF_ERROR( + TRITONSERVER_InferenceResponseDelete(response), + "Failed to release BLS inference response."); + } else if ( + (infer_payload)->IsDecoupled() && + (flags & TRITONSERVER_RESPONSE_COMPLETE_FINAL) != 0) { + // An empty response may be the last response for decoupled models. 
+ infer_response = std::make_unique( + output_tensors, pb_error, "" /* parameters */, + true /* is_last_response */, userp /* id */); + } else { + pb_error = std::make_shared("Unexpected empty response."); + infer_response = std::make_unique( + output_tensors, pb_error, "" /* parameters */, + true /* is_last_response */, userp /* id */); + } + + infer_payload->SetValue(std::move(infer_response)); +} + +TRITONSERVER_Error* +ResponseAlloc( + TRITONSERVER_ResponseAllocator* allocator, const char* tensor_name, + size_t byte_size, TRITONSERVER_MemoryType preferred_memory_type, + int64_t preferred_memory_type_id, void* userp, void** buffer, + void** buffer_userp, TRITONSERVER_MemoryType* actual_memory_type, + int64_t* actual_memory_type_id) +{ + auto p = reinterpret_cast(userp); + std::unique_ptr shm_pool( + reinterpret_cast(p->shm_pool)); + + ScopedDefer _([&shm_pool] { shm_pool.release(); }); + + if (p->preferred_memory.PreferredMemoryType() == + PreferredMemory::MemoryType::kDefault) { + *actual_memory_type = preferred_memory_type; + *actual_memory_type_id = preferred_memory_type_id; + } else { + TRITONSERVER_MemoryType user_preferred_memory_type; + RETURN_IF_ERROR(MemoryTypeToTritonMemoryType( + &user_preferred_memory_type, + p->preferred_memory.PreferredMemoryType())); + *actual_memory_type = user_preferred_memory_type; + *actual_memory_type_id = p->preferred_memory.PreferredDeviceId(); + } + + // If 'byte_size' is zero just return 'buffer' == nullptr, we don't + // need to do any other book-keeping. + if (byte_size == 0) { + *buffer = nullptr; + *buffer_userp = nullptr; + } else { + switch (*actual_memory_type) { + case TRITONSERVER_MEMORY_CPU: +#ifndef TRITON_ENABLE_GPU + case TRITONSERVER_MEMORY_GPU: +#endif + case TRITONSERVER_MEMORY_CPU_PINNED: { + *actual_memory_type = TRITONSERVER_MEMORY_CPU; + *actual_memory_type_id = 0; + try { + std::unique_ptr pb_memory = PbMemory::Create( + shm_pool, *actual_memory_type, *actual_memory_type_id, byte_size, + nullptr /* data */, false /* copy_gpu */); + *buffer = pb_memory->DataPtr(); + *buffer_userp = reinterpret_cast(pb_memory.get()); + pb_memory.release(); + } + catch (const PythonBackendException& pb_exception) { + TRITONSERVER_Error* err = + CreateTritonErrorFromException(pb_exception); + return err; + } + + } break; +#ifdef TRITON_ENABLE_GPU + case TRITONSERVER_MEMORY_GPU: { + BackendMemory* backend_memory; + std::unique_ptr lbackend_memory; + try { + THROW_IF_TRITON_ERROR(BackendMemory::Create( + reinterpret_cast( + shm_pool->GetCUDAMemoryPoolManager()->TritonMemoryManager()), + {BackendMemory::AllocationType::GPU_POOL, + BackendMemory::AllocationType::GPU}, + *actual_memory_type_id, byte_size, &backend_memory)); + lbackend_memory.reset(backend_memory); + + std::unique_ptr pb_memory = PbMemory::Create( + shm_pool, std::move(lbackend_memory), true /* copy_gpu */); + *buffer = pb_memory->DataPtr(); + *buffer_userp = reinterpret_cast(pb_memory.get()); + pb_memory.release(); + } + catch (const PythonBackendException& pb_exception) { + TRITONSERVER_Error* err = + CreateTritonErrorFromException(pb_exception); + return err; + } + break; + } +#endif + } + } + + return nullptr; // Success +} + +void +InferRequestCancel(intptr_t request_address) +{ + if (request_address == 0L) { + return; + } + + TRITONSERVER_InferenceRequest* irequest = + reinterpret_cast(request_address); + THROW_IF_TRITON_ERROR(TRITONSERVER_InferenceRequestCancel(irequest)); +} + +TRITONSERVER_Error* +OutputBufferQuery( + TRITONSERVER_ResponseAllocator* allocator, void* userp, + 
const char* tensor_name, size_t* byte_size, + TRITONSERVER_MemoryType* memory_type, int64_t* memory_type_id) +{ + // Always attempt to return the memory in the requested memory_type and + // memory_type_id. + return nullptr; // Success +} + +TRITONSERVER_Error* +ResponseRelease( + TRITONSERVER_ResponseAllocator* allocator, void* buffer, void* buffer_userp, + size_t byte_size, TRITONSERVER_MemoryType memory_type, + int64_t memory_type_id) +{ + return nullptr; // Success +} + +RequestExecutor::RequestExecutor( + std::unique_ptr& shm_pool, TRITONSERVER_Server* server) + : server_(server), shm_pool_(shm_pool) +{ + TRITONSERVER_ResponseAllocator* allocator; + THROW_IF_TRITON_ERROR(TRITONSERVER_ResponseAllocatorNew( + &allocator, ResponseAlloc, ResponseRelease, nullptr /* start_fn */)); + THROW_IF_TRITON_ERROR(TRITONSERVER_ResponseAllocatorSetQueryFunction( + allocator, OutputBufferQuery)); + response_allocator_ = allocator; +} + +std::future> +RequestExecutor::Infer( + std::shared_ptr& infer_request, + std::shared_ptr& infer_payload) +{ + std::future> response_future; + std::unique_ptr infer_response; + bool is_ready = false; + const char* model_name = infer_request->ModelName().c_str(); + TRITONSERVER_InferenceRequest* irequest = nullptr; + RequestCompletionUserp* completion_userp = nullptr; + + try { + int64_t model_version = infer_request->ModelVersion(); + THROW_IF_TRITON_ERROR(TRITONSERVER_ServerModelIsReady( + server_, model_name, model_version, &is_ready)); + + if (!is_ready) { + throw PythonBackendException( + (std::string("Failed for execute the inference request. Model '") + + model_name + "' is not ready.") + .c_str()); + } + + uint32_t txn_flags; + THROW_IF_TRITON_ERROR(TRITONSERVER_ServerModelTransactionProperties( + server_, model_name, model_version, &txn_flags, nullptr /* voidp */)); + infer_request->SetIsDecoupled( + (txn_flags & TRITONSERVER_TXN_DECOUPLED) != 0); + + if (!infer_payload->IsDecoupled() && infer_request->IsDecoupled()) { + // Decoupled API is only supported by using stream API + throw PythonBackendException( + std::string("Model ") + model_name + + " is using the decoupled. The current BLS request call doesn't " + "support models using the decoupled transaction policy. 
Please use " + "'decoupled=True' argument to the 'exec' or 'async_exec' calls for " + "decoupled models.'"); + } + + // Inference + THROW_IF_TRITON_ERROR(TRITONSERVER_InferenceRequestNew( + &irequest, server_, model_name, model_version)); + + THROW_IF_TRITON_ERROR(TRITONSERVER_InferenceRequestSetId( + irequest, infer_request->RequestId().c_str())); + + if (infer_request->GetCorrelationId().Type() == + CorrelationIdDataType::UINT64) { + THROW_IF_TRITON_ERROR(TRITONSERVER_InferenceRequestSetCorrelationId( + irequest, infer_request->GetCorrelationId().UnsignedIntValue())); + } else { + THROW_IF_TRITON_ERROR(TRITONSERVER_InferenceRequestSetCorrelationIdString( + irequest, infer_request->GetCorrelationId().StringValue().c_str())); + } + + THROW_IF_TRITON_ERROR(TRITONSERVER_InferenceRequestSetFlags( + irequest, infer_request->Flags())); + + THROW_IF_TRITON_ERROR(TRITONSERVER_InferenceRequestSetTimeoutMicroseconds( + irequest, infer_request->Timeout())); + + completion_userp = new RequestCompletionUserp(infer_payload); + THROW_IF_TRITON_ERROR(TRITONSERVER_InferenceRequestSetReleaseCallback( + irequest, InferRequestComplete, + reinterpret_cast(completion_userp))); + + TRITONSERVER_InferenceTrace* trace = nullptr; + if (infer_request->GetTrace().TritonTrace() != nullptr) { + THROW_IF_TRITON_ERROR(TRITONSERVER_InferenceTraceSpawnChildTrace( + reinterpret_cast( + infer_request->GetTrace().TritonTrace()), + &trace)); + } + + const std::string& param_str = infer_request->Parameters(); + triton::common::TritonJson::Value param; + THROW_IF_TRITON_ERROR(param.Parse(param_str.c_str(), param_str.length())); + std::vector param_keys; + THROW_IF_TRITON_ERROR(param.Members(¶m_keys)); + for (const auto& key : param_keys) { + triton::common::TritonJson::Value value; + if (!param.Find(key.c_str(), &value)) { + throw PythonBackendException("Unexpected missing key on parameters"); + } + if (value.IsString()) { + std::string string_value; + THROW_IF_TRITON_ERROR(value.AsString(&string_value)); + THROW_IF_TRITON_ERROR(TRITONSERVER_InferenceRequestSetStringParameter( + irequest, key.c_str(), string_value.c_str())); + } else if (value.IsInt()) { + int64_t int_value = 0; + THROW_IF_TRITON_ERROR(value.AsInt(&int_value)); + THROW_IF_TRITON_ERROR(TRITONSERVER_InferenceRequestSetIntParameter( + irequest, key.c_str(), int_value)); + } else if (value.IsBool()) { + bool bool_value = false; + THROW_IF_TRITON_ERROR(value.AsBool(&bool_value)); + THROW_IF_TRITON_ERROR(TRITONSERVER_InferenceRequestSetBoolParameter( + irequest, key.c_str(), bool_value)); + } else { + throw PythonBackendException("Unsupported value type on parameters"); + } + } + + for (auto& infer_input : infer_request->Inputs()) { + THROW_IF_TRITON_ERROR(TRITONSERVER_InferenceRequestAddInput( + irequest, infer_input->Name().c_str(), + static_cast(infer_input->TritonDtype()), + infer_input->Dims().data(), infer_input->Dims().size())); + + THROW_IF_TRITON_ERROR(TRITONSERVER_InferenceRequestAppendInputData( + irequest, infer_input->Name().c_str(), infer_input->DataPtr(), + infer_input->ByteSize(), infer_input->MemoryType(), + infer_input->MemoryTypeId())); + } + + for (auto& requested_output_name : infer_request->RequestedOutputNames()) { + THROW_IF_TRITON_ERROR(TRITONSERVER_InferenceRequestAddRequestedOutput( + irequest, requested_output_name.c_str())); + } + + { + infer_payload->SetFuture(response_future); + + ResponseAllocatorUserp response_allocator_userp( + shm_pool_.get(), infer_request->GetPreferredMemory()); + 
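`SetFuture` above wires the caller to the payload, and `InferResponseComplete` later fulfills it through `SetValue`. A minimal sketch of that promise/future handshake is shown here with stand-in types; it is not the actual InferPayload or InferResponse implementation.

#include <future>
#include <memory>
#include <string>

// Stand-in for InferResponse; illustrative only.
struct ExampleResponse {
  std::string data;
};

// The payload owns the promise, the caller keeps the matching future, and the
// completion callback fulfills the promise on another thread.
class ExamplePayload {
 public:
  std::future<std::unique_ptr<ExampleResponse>> GetFuture()
  {
    return promise_.get_future();
  }

  void SetValue(std::unique_ptr<ExampleResponse> response)
  {
    promise_.set_value(std::move(response));
  }

 private:
  std::promise<std::unique_ptr<ExampleResponse>> promise_;
};

// Usage: the caller stores payload.GetFuture() before submitting the request;
// the completion callback later calls payload.SetValue(...), which unblocks
// future.get() on the caller side.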
infer_payload->SetResponseAllocUserp(response_allocator_userp); + + THROW_IF_TRITON_ERROR(TRITONSERVER_InferenceRequestSetResponseCallback( + irequest, response_allocator_, + reinterpret_cast(infer_payload->ResponseAllocUserp().get()), + InferResponseComplete, reinterpret_cast(infer_payload.get()))); + + // Store the inference request address submitted to the Triton server for + // retrieval + infer_payload->SetRequestAddress(reinterpret_cast(irequest)); + infer_payload->SetRequestCancellationFunc(InferRequestCancel); + + THROW_IF_TRITON_ERROR( + TRITONSERVER_ServerInferAsync(server_, irequest, trace)); + } + } + catch (const PythonBackendException& pb_exception) { + infer_payload->SetRequestAddress(0L); + if (completion_userp != nullptr) { + delete completion_userp; + } + + LOG_IF_ERROR( + TRITONSERVER_InferenceRequestDelete(irequest), + "Failed to delete inference request."); + + throw PythonBackendException( + std::string("Model ") + model_name + + " - Error when running inference: " + pb_exception.what()); + } + + return response_future; +} + +RequestExecutor::~RequestExecutor() +{ + if (response_allocator_ != nullptr) { + LOG_IF_ERROR( + TRITONSERVER_ResponseAllocatorDelete(response_allocator_), + "Failed to delete allocator."); + } +} +}}}; // namespace triton::backend::python diff --git a/src/request_executor.h b/src/request_executor.h new file mode 100644 index 00000000..07562d6a --- /dev/null +++ b/src/request_executor.h @@ -0,0 +1,62 @@ +// Copyright 2021-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions +// are met: +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// * Neither the name of NVIDIA CORPORATION nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +#pragma once + +#include + +#include "infer_payload.h" +#include "infer_request.h" +#include "infer_response.h" + +namespace triton { namespace backend { namespace python { + +TRITONSERVER_Error* CreateTritonErrorFromException( + const PythonBackendException& pb_exception); + +struct RequestCompletionUserp { + std::shared_ptr infer_payload; + RequestCompletionUserp(std::shared_ptr& infer_payload) + : infer_payload(infer_payload){}; +}; + +class RequestExecutor { + TRITONSERVER_ResponseAllocator* response_allocator_ = nullptr; + TRITONSERVER_Server* server_; + std::unique_ptr& shm_pool_; + + public: + std::future> Infer( + std::shared_ptr& infer_request, + std::shared_ptr& infer_payload); + + RequestExecutor( + std::unique_ptr& shm_pool, + TRITONSERVER_Server* server); + + ~RequestExecutor(); +}; +}}} // namespace triton::backend::python diff --git a/src/resources/triton_python_backend_utils.py b/src/resources/triton_python_backend_utils.py index 8657b063..de332cf7 100644 --- a/src/resources/triton_python_backend_utils.py +++ b/src/resources/triton_python_backend_utils.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2021, NVIDIA CORPORATION. All rights reserved. +# Copyright 2020-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions @@ -24,23 +24,25 @@ # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -import numpy as np +import json import struct +import numpy as np + TRITON_STRING_TO_NUMPY = { - 'TYPE_BOOL': bool, - 'TYPE_UINT8': np.uint8, - 'TYPE_UINT16': np.uint16, - 'TYPE_UINT32': np.uint32, - 'TYPE_UINT64': np.uint64, - 'TYPE_INT8': np.int8, - 'TYPE_INT16': np.int16, - 'TYPE_INT32': np.int32, - 'TYPE_INT64': np.int64, - 'TYPE_FP16': np.float16, - 'TYPE_FP32': np.float32, - 'TYPE_FP64': np.float64, - 'TYPE_STRING': np.object_ + "TYPE_BOOL": bool, + "TYPE_UINT8": np.uint8, + "TYPE_UINT16": np.uint16, + "TYPE_UINT32": np.uint32, + "TYPE_UINT64": np.uint64, + "TYPE_INT8": np.int8, + "TYPE_INT16": np.int16, + "TYPE_INT32": np.int32, + "TYPE_INT64": np.int64, + "TYPE_FP16": np.float16, + "TYPE_FP32": np.float32, + "TYPE_FP64": np.float64, + "TYPE_STRING": np.object_, } @@ -65,16 +67,14 @@ def serialize_byte_tensor(input_tensor): """ if input_tensor.size == 0: - return None - - # If the input is a tensor of string/bytes objects, then must flatten those into - # a 1-dimensional array containing the 4-byte byte size followed by the - # actual element bytes. All elements are concatenated together in "C" - # order. - if (input_tensor.dtype == np.object_) or (input_tensor.dtype.type - == np.bytes_): + return () + + # If the input is a tensor of string/bytes objects, then must flatten those + # into a 1-dimensional array containing the 4-byte byte size followed by the + # actual element bytes. All elements are concatenated together in "C" order. 
+ if (input_tensor.dtype == np.object_) or (input_tensor.dtype.type == np.bytes_): flattened_ls = [] - for obj in np.nditer(input_tensor, flags=["refs_ok"], order='C'): + for obj in np.nditer(input_tensor, flags=["refs_ok"], order="C"): # If directly passing bytes to BYTES type, # don't convert it to str as Python will encode the # bytes which may distort the meaning @@ -82,12 +82,12 @@ def serialize_byte_tensor(input_tensor): if type(obj.item()) == bytes: s = obj.item() else: - s = str(obj.item()).encode('utf-8') + s = str(obj.item()).encode("utf-8") else: s = obj.item() flattened_ls.append(struct.pack(" max_batch_size: + raise ValueError( + "configuration specified max_batch_size " + + str(self._model_config["max_batch_size"]) + + ", but in auto-complete-config function for model '" + + self._model_config["name"] + + "' specified max_batch_size " + + str(max_batch_size) + ) + else: + self._model_config["max_batch_size"] = max_batch_size + + def set_dynamic_batching(self): + """Set dynamic_batching as the scheduler for the model if no scheduler + is set. If dynamic_batching is set in the model configuration, then no + action is taken and return success. + Raises + ------ + ValueError + If the 'sequence_batching' or 'ensemble_scheduling' scheduler is + set for this model configuration. + """ + found_scheduler = None + if "sequence_batching" in self._model_config: + found_scheduler = "sequence_batching" + elif "ensemble_scheduling" in self._model_config: + found_scheduler = "ensemble_scheduling" + + if found_scheduler != None: + raise ValueError( + "Configuration specified scheduling_choice as '" + + found_scheduler + + "', but auto-complete-config " + "function for model '" + + self._model_config["name"] + + "' tries to set scheduling_choice as 'dynamic_batching'" + ) + + if "dynamic_batching" not in self._model_config: + self._model_config["dynamic_batching"] = {} + + def add_input(self, input): + """Add the input for the model. + Parameters + ---------- + input : dict + The input to be added. + Raises + ------ + ValueError + If input contains property other than 'name', 'data_type', + 'dims', 'optional' or any of the non-optional properties + are not set, or if an input with the same name already exists + in the configuration but has different data_type or dims property + """ + valid_properties = ["name", "data_type", "dims", "optional"] + for current_property in input: + if current_property not in valid_properties: + raise ValueError( + "input '" + + input["name"] + + "' in auto-complete-config function for model '" + + self._model_config["name"] + + "' contains property other than 'name', 'data_type', 'dims' and 'optional'." + ) + + if "name" not in input: + raise ValueError( + "input in auto-complete-config function for model '" + + self._model_config["name"] + + "' is missing 'name' property." + ) + elif "data_type" not in input: + raise ValueError( + "input '" + + input["name"] + + "' in auto-complete-config function for model '" + + self._model_config["name"] + + "' is missing 'data_type' property." + ) + elif "dims" not in input: + raise ValueError( + "input '" + + input["name"] + + "' in auto-complete-config function for model '" + + self._model_config["name"] + + "' is missing 'dims' property." 
+ ) + + for current_input in self._model_config["input"]: + if input["name"] == current_input["name"]: + if ( + current_input["data_type"] != "TYPE_INVALID" + and current_input["data_type"] != input["data_type"] + ): + raise ValueError( + "unable to load model '" + + self._model_config["name"] + + "', configuration expects datatype " + + current_input["data_type"] + + " for input '" + + input["name"] + + "', model provides " + + input["data_type"] + ) + elif current_input["dims"] and current_input["dims"] != input["dims"]: + raise ValueError( + "model '" + + self._model_config["name"] + + "', tensor '" + + input["name"] + + "': the model expects dims " + + str(input["dims"]) + + " but the model configuration specifies dims " + + str(current_input["dims"]) + ) + elif ( + "optional" in current_input + and "optional" in input + and current_input["optional"] != input["optional"] + ): + raise ValueError( + "model '" + + self._model_config["name"] + + "', tensor '" + + input["name"] + + "': the model expects optional " + + str(input["optional"]) + + " but the model configuration specifies optional " + + str(current_input["optional"]) + ) + else: + current_input["data_type"] = input["data_type"] + current_input["dims"] = input["dims"] + if "optional" in input: + current_input["optional"] = input["optional"] + return + + self._model_config["input"].append(input) + + def add_output(self, output): + """Add the output for the model. + Parameters + ---------- + output : dict + The output to be added. + Raises + ------ + ValueError + If output contains property other than 'name', 'data_type' + and 'dims' or any of the properties are not set, or if an + output with the same name already exists in the configuration + but has different data_type or dims property + """ + valid_properties = ["name", "data_type", "dims"] + for current_property in output: + if current_property not in valid_properties: + raise ValueError( + "output '" + + output["name"] + + "' in auto-complete-config function for model '" + + self._model_config["name"] + + "' contains property other than 'name', 'data_type' and 'dims'." + ) + + if "name" not in output: + raise ValueError( + "output in auto-complete-config function for model '" + + self._model_config["name"] + + "' is missing 'name' property." + ) + elif "data_type" not in output: + raise ValueError( + "output '" + + output["name"] + + "' in auto-complete-config function for model '" + + self._model_config["name"] + + "' is missing 'data_type' property." + ) + elif "dims" not in output: + raise ValueError( + "output '" + + output["name"] + + "' in auto-complete-config function for model '" + + self._model_config["name"] + + "' is missing 'dims' property." 
+ ) + + for current_output in self._model_config["output"]: + if output["name"] == current_output["name"]: + if ( + current_output["data_type"] != "TYPE_INVALID" + and current_output["data_type"] != output["data_type"] + ): + raise ValueError( + "unable to load model '" + + self._model_config["name"] + + "', configuration expects datatype " + + current_output["data_type"] + + " for output '" + + output["name"] + + "', model provides " + + output["data_type"] + ) + elif ( + current_output["dims"] and current_output["dims"] != output["dims"] + ): + raise ValueError( + "model '" + + self._model_config["name"] + + "', tensor '" + + output["name"] + + "': the model expects dims " + + str(output["dims"]) + + " but the model configuration specifies dims " + + str(current_output["dims"]) + ) + else: + current_output["data_type"] = output["data_type"] + current_output["dims"] = output["dims"] + return + + self._model_config["output"].append(output) + + def set_model_transaction_policy(self, transaction_policy_dict): + """ + Set model transaction policy for the model. + Parameters + ---------- + transaction_policy_dict : dict + The dict, containing all properties to be set as a part + of `model_transaction_policy` field. + Raises + ------ + ValueError + If transaction_policy_dict contains property other + than 'decoupled', or if `model_transaction_policy` already exists + in the configuration, but has different `decoupled` property. + """ + valid_properties = ["decoupled"] + for current_property in transaction_policy_dict.keys(): + if current_property not in valid_properties: + raise ValueError( + "model transaction property in auto-complete-config " + + "function for model '" + + self._model_config["name"] + + "' contains property other than 'decoupled'." + ) + + if "model_transaction_policy" not in self._model_config: + self._model_config["model_transaction_policy"] = {} + + if "decoupled" in transaction_policy_dict.keys(): + if ( + "decoupled" in self._model_config["model_transaction_policy"] + and self._model_config["model_transaction_policy"]["decoupled"] + != transaction_policy_dict["decoupled"] + ): + raise ValueError( + "trying to change decoupled property in auto-complete-config " + + "for model '" + + self._model_config["name"] + + "', which is already set to '" + + str(self._model_config["model_transaction_policy"]["decoupled"]) + + "'." + ) + + self._model_config["model_transaction_policy"][ + "decoupled" + ] = transaction_policy_dict["decoupled"] + + +TRITONSERVER_REQUEST_FLAG_SEQUENCE_START = 1 +TRITONSERVER_REQUEST_FLAG_SEQUENCE_END = 2 +TRITONSERVER_RESPONSE_COMPLETE_FINAL = 1 +TRITONSERVER_REQUEST_RELEASE_ALL = 1 +TRITONSERVER_REQUEST_RELEASE_RESCHEDULE = 2 diff --git a/src/response_sender.cc b/src/response_sender.cc new file mode 100644 index 00000000..ef3b09dd --- /dev/null +++ b/src/response_sender.cc @@ -0,0 +1,289 @@ +// Copyright 2022-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions +// are met: +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. 
+// * Neither the name of NVIDIA CORPORATION nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +#include "response_sender.h" + +#include +#include + +#include "pb_stub.h" +#include "pb_stub_utils.h" +#include "scoped_defer.h" + +namespace triton { namespace backend { namespace python { + +void +CheckResponseSenderArguments( + const std::shared_ptr& response, const uint32_t flags) +{ + // Check the correctness of the provided flags. + if (flags != TRITONSERVER_RESPONSE_COMPLETE_FINAL && flags != 0) { + throw PythonBackendException( + "Unable to send response. Unsupported flag provided."); + } + + if (flags == 0 && response == nullptr) { + throw PythonBackendException( + "Inference Response object must be provided when the response flags is " + "set to zero."); + } +} + +ResponseSender::ResponseSender( + intptr_t request_address, intptr_t response_factory_address, + bool const* is_decoupled, + const std::set& requested_output_names, + std::unique_ptr& shm_pool, + const std::shared_ptr& pb_cancel) + : request_address_(request_address), + response_factory_address_(response_factory_address), + is_decoupled_(is_decoupled), + requested_output_names_(requested_output_names), shm_pool_(shm_pool), + pb_cancel_(pb_cancel), closed_(false), number_of_response_sent_(0), + response_factory_deleted_(false) +{ +} + +ResponseSender::~ResponseSender() +{ + DeleteResponseFactory(); +} + +void +ResponseSender::UpdateStateAndCounters( + InferResponse* response, const uint32_t flags) +{ + if (is_decoupled_ == nullptr) { + // TODO: Can a model access the response sender on a BLS infer request? + throw PythonBackendException( + "Unable to send response. Response sender has no reference to the " + "decoupled state of the model."); + } + bool is_decoupled = *is_decoupled_; + + std::lock_guard lk(mu_); + + if (!is_decoupled) { + if (response != nullptr && number_of_response_sent_ > 0) { + throw PythonBackendException( + "Unable to send response. Non-decoupled model cannot send more than " + "one response."); + } + if (response == nullptr && flags == TRITONSERVER_RESPONSE_COMPLETE_FINAL && + number_of_response_sent_ == 0) { + throw PythonBackendException( + "Unable to send response. Non-decoupled model cannot send complete " + "final before sending a response."); + } + } + + if (closed_) { + throw PythonBackendException( + "Unable to send response. 
Response sender has been closed."); + } + + if (flags == TRITONSERVER_RESPONSE_COMPLETE_FINAL) { + response_factory_deleted_.exchange(true); + closed_ = true; + } + number_of_response_sent_++; +} + +void +ResponseSender::Send( + std::shared_ptr infer_response, const uint32_t flags) +{ + // Release the GIL. This avoids a potential deadlock situation in the parent + // process, where every thread in the thread pool is indirectly waiting for a + // function in the stub process that acquires the GIL. Meanwhile, the current + // thread, which holds the GIL, is also waiting for the parent side to have + // the next available thread to pick up the job during resource contention. + py::gil_scoped_release release; + + CheckResponseSenderArguments(infer_response, flags); + UpdateStateAndCounters(infer_response.get(), flags); + if (infer_response) { + infer_response->PruneOutputTensors(requested_output_names_); + } + + std::unique_ptr& stub = Stub::GetOrCreateInstance(); + + AllocatedSharedMemory response_send_message = + shm_pool_->Construct( + 1 /* count */, true /* aligned */); + + if (infer_response) { + infer_response->SaveToSharedMemory(shm_pool_, false /* copy_gpu */); + } + + ResponseSendMessage* send_message_payload = response_send_message.data_.get(); + new (&(send_message_payload->mu)) bi::interprocess_mutex; + new (&(send_message_payload->cv)) bi::interprocess_condition; + + send_message_payload->is_stub_turn = false; + send_message_payload->request_address = request_address_; + send_message_payload->response_factory_address = response_factory_address_; + + if (infer_response) { + send_message_payload->response = infer_response->ShmHandle(); + } else { + send_message_payload->response = 0; + } + + send_message_payload->has_error = false; + send_message_payload->is_error_set = false; + send_message_payload->flags = flags; + + std::unique_ptr ipc_message = + IPCMessage::Create(shm_pool_, false /* inline_response */); + + ipc_message->Command() = PYTHONSTUB_ResponseSend; + ipc_message->Args() = response_send_message.handle_; + + ScopedDefer _([send_message_payload] { + { + bi::scoped_lock guard{send_message_payload->mu}; + send_message_payload->is_stub_turn = false; + send_message_payload->cv.notify_all(); + } + }); + + { + bi::scoped_lock guard{send_message_payload->mu}; + // The server will destruct the response factory if the final flag is set. + if (flags == TRITONSERVER_RESPONSE_COMPLETE_FINAL) { + response_factory_deleted_.exchange(true); + } + stub->SendIPCUtilsMessage(ipc_message); + while (!send_message_payload->is_stub_turn) { + send_message_payload->cv.wait(guard); + } + } + + bool has_gpu_output = false; + std::vector> gpu_tensors; + if (infer_response) { + for (auto& tensor : infer_response->OutputTensors()) { + if (!tensor->IsCPU()) { + has_gpu_output = true; + gpu_tensors.push_back(tensor); + } + } + } + + if (has_gpu_output) { + ScopedDefer _([send_message_payload] { + bi::scoped_lock guard{send_message_payload->mu}; + send_message_payload->is_stub_turn = false; + send_message_payload->cv.notify_one(); + while (!send_message_payload->is_stub_turn) { + // Wait for the stub process to send the response and populate error + // message if any. 
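The wait loop below, together with the notify performed in the ScopedDefer above, follows a simple turn-taking protocol over a shared-memory mutex and condition variable: each side flips `is_stub_turn`, notifies, and waits until the flag indicates it is its turn again. A self-contained sketch of that protocol is below; the struct and function names are stand-ins, not the actual `ResponseSendMessage` layout.

#include <boost/interprocess/sync/interprocess_condition.hpp>
#include <boost/interprocess/sync/interprocess_mutex.hpp>
#include <boost/interprocess/sync/scoped_lock.hpp>

namespace example_ipc = boost::interprocess;

// Both processes map this struct in shared memory; illustrative only.
struct ExampleHandshake {
  example_ipc::interprocess_mutex mu;
  example_ipc::interprocess_condition cv;
  bool is_stub_turn = false;
};

// Parent side: block until the stub hands the turn back.
void
WaitForStub(ExampleHandshake* h)
{
  example_ipc::scoped_lock<example_ipc::interprocess_mutex> guard{h->mu};
  while (!h->is_stub_turn) {
    h->cv.wait(guard);
  }
}

// Stub side: flip the flag and wake the waiting process.
void
HandTurnToStub(ExampleHandshake* h)
{
  example_ipc::scoped_lock<example_ipc::interprocess_mutex> guard{h->mu};
  h->is_stub_turn = true;
  h->cv.notify_all();
}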
+ send_message_payload->cv.wait(guard); + } + }); + + AllocatedSharedMemory gpu_buffers_handle = + shm_pool_->Load( + send_message_payload->gpu_buffers_handle); + if (!gpu_buffers_handle.data_->success) { + std::unique_ptr error = PbString::LoadFromSharedMemory( + shm_pool_, gpu_buffers_handle.data_->error); + throw PythonBackendException( + "Failed to load GPU buffers: " + error->String()); + } + + AllocatedSharedMemory + gpu_buffers_handle_shm = + shm_pool_->Load( + gpu_buffers_handle.data_->buffers); + uint64_t gpu_buffer_count = gpu_buffers_handle.data_->buffer_count; + if (gpu_tensors.size() != gpu_buffer_count) { + throw PythonBackendException( + std::string( + "GPU buffers size does not match the provided buffers: ") + + std::to_string(gpu_tensors.size()) + + " != " + std::to_string(gpu_buffer_count)); + } + + std::vector> dst_buffers; + + for (size_t i = 0; i < gpu_tensors.size(); i++) { + std::unique_ptr dst_buffer = PbMemory::LoadFromSharedMemory( + shm_pool_, gpu_buffers_handle_shm.data_.get()[i], + true /* open_cuda_handle */); + dst_buffers.emplace_back(std::move(dst_buffer)); + std::shared_ptr& src_buffer = gpu_tensors[i]; + PbMemory::CopyBuffer(dst_buffers[i], src_buffer->Memory()); + } + } + + if (send_message_payload->has_error) { + if (send_message_payload->is_error_set) { + std::unique_ptr error = PbString::LoadFromSharedMemory( + shm_pool_, send_message_payload->error); + throw PythonBackendException(error->String()); + } else { + throw PythonBackendException( + "An error occurred while sending a response."); + } + } +} + +bool +ResponseSender::IsCancelled() +{ + return pb_cancel_->IsCancelled(); +} + +bool +ResponseSender::IsClosed() +{ + std::lock_guard lk(mu_); + return closed_; +} + +void +ResponseSender::Close() +{ + std::lock_guard lk(mu_); + closed_ = true; + response_factory_deleted_.exchange(true); +} + +void +ResponseSender::DeleteResponseFactory() +{ + bool already_deleted = response_factory_deleted_.exchange(true); + if (!already_deleted) { + std::unique_ptr& stub = Stub::GetOrCreateInstance(); + stub->EnqueueCleanupId( + reinterpret_cast(response_factory_address_), + PYTHONSTUB_DecoupledResponseFactoryCleanup); + } +} + +}}} // namespace triton::backend::python diff --git a/src/response_sender.h b/src/response_sender.h new file mode 100644 index 00000000..a696f9eb --- /dev/null +++ b/src/response_sender.h @@ -0,0 +1,72 @@ +// Copyright 2022-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions +// are met: +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// * Neither the name of NVIDIA CORPORATION nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +#pragma once + +#include +#include + +#include "infer_response.h" +#include "pb_cancel.h" +#include "shm_manager.h" + +namespace triton { namespace backend { namespace python { + +class ResponseSender { + public: + ResponseSender( + intptr_t request_address, intptr_t response_factory_address, + bool const* is_decoupled, + const std::set& requested_output_names, + std::unique_ptr& shm_pool, + const std::shared_ptr& pb_cancel); + intptr_t ResponseFactory() { return response_factory_address_; } + ~ResponseSender(); + void Send(std::shared_ptr response, const uint32_t flags); + bool IsCancelled(); + void UpdateStateAndCounters(InferResponse* response, const uint32_t flags); + + // Can be useful at stopping the model from sending any more responses. + void Close(); + bool IsClosed(); + + private: + void DeleteResponseFactory(); + + intptr_t request_address_; + intptr_t response_factory_address_; + bool const* is_decoupled_; + std::set requested_output_names_; + std::unique_ptr& shm_pool_; + std::shared_ptr pb_cancel_; + + std::mutex mu_; + bool closed_; + size_t number_of_response_sent_; + + std::atomic response_factory_deleted_; +}; +}}} // namespace triton::backend::python diff --git a/src/scoped_defer.cc b/src/scoped_defer.cc new file mode 100644 index 00000000..9c33bfd2 --- /dev/null +++ b/src/scoped_defer.cc @@ -0,0 +1,52 @@ +// Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions +// are met: +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// * Neither the name of NVIDIA CORPORATION nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
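ScopedDefer, implemented in this file, is the small RAII helper used throughout the patch (for example in ResponseSender::Send and ResponseAlloc) to guarantee a cleanup task runs on every exit path. A typical usage sketch follows; the function body and the deferred task are illustrative.

#include "scoped_defer.h"

// Illustrative only: the deferred task runs in the destructor unless
// Complete() has already executed it.
void
ExampleWithDeferredCleanup()
{
  bool notified = false;
  ScopedDefer notify_peer([&notified] { notified = true; });

  // ... work that may throw or return early; 'notify_peer' still fires ...

  // Run the task early on the happy path; the destructor then does nothing
  // because the task is marked as done.
  notify_peer.Complete();
}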
+ +#include "scoped_defer.h" + +namespace triton { namespace backend { namespace python { +ScopedDefer::ScopedDefer(std::function task) +{ + task_ = task; + done_ = false; +} + +void +ScopedDefer::Complete() +{ + if (!done_) { + task_(); + done_ = true; + } +} + +ScopedDefer::~ScopedDefer() +{ + if (!done_) { + task_(); + } +} + +}}}; // namespace triton::backend::python diff --git a/src/scoped_defer.h b/src/scoped_defer.h new file mode 100644 index 00000000..eb52d6b6 --- /dev/null +++ b/src/scoped_defer.h @@ -0,0 +1,42 @@ +// Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions +// are met: +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// * Neither the name of NVIDIA CORPORATION nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +#pragma once +#include + +namespace triton { namespace backend { namespace python { +class ScopedDefer { + public: + ScopedDefer(std::function task); + ~ScopedDefer(); + void Complete(); + + private: + std::function task_; + bool done_; +}; + +}}} // namespace triton::backend::python diff --git a/src/shm_manager.cc b/src/shm_manager.cc index d4df93b3..134cee6f 100644 --- a/src/shm_manager.cc +++ b/src/shm_manager.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2020-2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// Copyright 2021-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
// // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions @@ -26,137 +26,216 @@ #include "shm_manager.h" -#include -#include -#include -#include -#include -#include -#include "pb_utils.h" +#include +#include +#include +#include namespace triton { namespace backend { namespace python { -namespace bi = boost::interprocess; +void +CUDAMemoryPoolManager::SetCUDAPoolAddress( + const int32_t device_id, void* cuda_pool_address) +{ + std::lock_guard lock(mu_); + cuda_pool_address_map_[device_id] = cuda_pool_address; +} -SharedMemory::SharedMemory( - const std::string& shm_key, int64_t default_byte_size, - int64_t shm_growth_bytes, bool truncate) +void* +CUDAMemoryPoolManager::CUDAPoolAddress(const int32_t device_id) { - if (truncate) { - shm_obj_ = bi::shared_memory_object( - bi::open_or_create, shm_key.c_str(), bi::read_write); + if (cuda_pool_address_map_.find(device_id) != cuda_pool_address_map_.end()) { + return cuda_pool_address_map_[device_id]; } else { - shm_obj_ = bi::shared_memory_object( - bi::open_only, shm_key.c_str(), bi::read_write); + throw PythonBackendException( + "CUDA pool address for device " + std::to_string(device_id) + + " is not set."); } +} + +void +CUDAMemoryPoolManager::SetTritonMemoryManager(void* triton_memory_manager) +{ + triton_memory_manager_ = triton_memory_manager; +} + +void* +CUDAMemoryPoolManager::TritonMemoryManager() +{ + return triton_memory_manager_; +} +bool +CUDAMemoryPoolManager::UseCudaSharedPool(const int32_t device_id) +{ + return (cuda_pool_address_map_.find(device_id) != + cuda_pool_address_map_.end()) && + (cuda_pool_address_map_[device_id] != nullptr) && + (triton_memory_manager_ != nullptr); +} + +std::unordered_map& +CUDAMemoryPoolManager::CUDAPoolAddressMap() +{ + return cuda_pool_address_map_; +} + +SharedMemoryManager::SharedMemoryManager( + const std::string& shm_region_name, size_t shm_size, + size_t shm_growth_bytes, bool create) +{ + shm_region_name_ = shm_region_name; + create_ = create; shm_growth_bytes_ = shm_growth_bytes; + cuda_memory_pool_manager_ = std::make_unique(); + try { - shm_obj_.truncate(default_byte_size); + if (create) { + // Remove (if any) and create the region. + bi::shared_memory_object::remove(shm_region_name.c_str()); + shm_obj_ = std::make_unique( + bi::create_only, shm_region_name.c_str(), bi::read_write); + shm_obj_->truncate(shm_size); + } else { + // Open the existing region. + shm_obj_ = std::make_unique( + bi::open_only, shm_region_name.c_str(), bi::read_write); + } + + current_capacity_ = shm_size; + shm_map_ = std::make_shared(*shm_obj_, bi::read_write); + old_shm_maps_.push_back(shm_map_); + + // Only create the managed external buffer for the stub process. + if (create) { + managed_buffer_ = std::make_unique( + bi::create_only, shm_map_->get_address(), shm_size); + } else { + int64_t shm_size = 0; + shm_obj_->get_size(shm_size); + managed_buffer_ = std::make_unique( + bi::open_only, shm_map_->get_address(), shm_size); + current_capacity_ = shm_size; + } } catch (bi::interprocess_exception& ex) { std::string error_message = - ("Unable to initialize shared memory key '" + shm_key + - "' to requested size (" + std::to_string(default_byte_size) + + ("Unable to initialize shared memory key '" + shm_region_name + + "' to requested size (" + std::to_string(shm_size) + " bytes). If you are running Triton inside docker, use '--shm-size' " "flag to control the shared memory region size. 
Each Python backend " - "model instance requires at least 64MBs of shared memory. Flag " - "'--shm-size=5G' should be sufficient for common usecases. Error: " + + "model instance requires at least 1 MB of shared memory. Error: " + ex.what()); + // Remove the shared memory region if there was an error. + bi::shared_memory_object::remove(shm_region_name.c_str()); throw PythonBackendException(std::move(error_message)); } - shm_map_ = std::make_unique(shm_obj_, bi::read_write); - shm_addr_ = (char*)shm_map_->get_address(); - - capacity_ = (size_t*)shm_addr_; - *capacity_ = default_byte_size; - current_capacity_ = *capacity_; - - // Set offset address - offset_ = (off_t*)((char*)shm_addr_ + sizeof(size_t)); - - *offset_ = 0; - *offset_ += sizeof(off_t); - *offset_ += sizeof(size_t); - - shm_key_ = shm_key; + // Construct a mutex in shared memory. + shm_mutex_ = + managed_buffer_->find_or_construct("shm_mutex")(); + total_size_ = managed_buffer_->find_or_construct("total size")(); + delete_region_ = true; + if (create) { + *total_size_ = current_capacity_; + new (shm_mutex_) bi::interprocess_mutex; + } } -SharedMemory::~SharedMemory() noexcept(false) +SharedMemoryManager::SharedMemoryManager(const std::string& shm_region_name) { - bi::shared_memory_object::remove(shm_key_.c_str()); + shm_region_name_ = shm_region_name; + create_ = false; + shm_growth_bytes_ = 1024; + cuda_memory_pool_manager_ = std::make_unique(); + + shm_obj_ = std::make_unique( + bi::open_only, shm_region_name.c_str(), bi::read_write); + + shm_map_ = std::make_shared(*shm_obj_, bi::read_write); + old_shm_maps_.push_back(shm_map_); + + int64_t shm_size = 0; + shm_obj_->get_size(shm_size); + managed_buffer_ = std::make_unique( + bi::open_only, shm_map_->get_address(), shm_size); + current_capacity_ = shm_size; + + // Construct a mutex in shared memory. + shm_mutex_ = + managed_buffer_->find_or_construct("shm_mutex")(); + total_size_ = managed_buffer_->find_or_construct("total size")(); + delete_region_ = false; } void -SharedMemory::Map(char** shm_addr, size_t byte_size, off_t& offset) +SharedMemoryManager::GrowIfNeeded(uint64_t byte_size) { - size_t shm_bytes_added = 0; - while (*offset_ + byte_size >= *capacity_) { - // Increase the shared memory pool size by the amount of bytes available. - *capacity_ += shm_growth_bytes_; - shm_bytes_added += shm_growth_bytes_; + if (*total_size_ != current_capacity_) { + shm_map_ = std::make_shared(*shm_obj_, bi::read_write); + managed_buffer_ = std::make_unique( + bi::open_only, shm_map_->get_address(), *total_size_); + old_shm_maps_.push_back(shm_map_); + current_capacity_ = *total_size_; } - if (shm_bytes_added > 0) { + if (byte_size != 0) { + uint64_t bytes_to_be_added = + shm_growth_bytes_ * (byte_size / shm_growth_bytes_ + 1); + uint64_t new_size = *total_size_ + bytes_to_be_added; try { - shm_obj_.truncate(*capacity_); + shm_obj_->truncate(new_size); } catch (bi::interprocess_exception& ex) { - *capacity_ -= shm_bytes_added; std::string error_message = - ("Failed to increase the shared memory pool size for key '" + - shm_key_ + "' to " + std::to_string(*capacity_) + + ("Failed to increase the shared memory pool size to " + + std::to_string(*total_size_) + " bytes. If you are running Triton inside docker, use '--shm-size' " "flag to control the shared memory region size. 
Error: " + ex.what()); throw PythonBackendException(error_message); } - } - - UpdateSharedMemory(); - - *shm_addr = shm_addr_ + *offset_; - offset = *offset_; - - *offset_ += byte_size; -} -void -SharedMemory::UpdateSharedMemory() -{ - if (current_capacity_ != *capacity_) { - std::unique_ptr new_map; try { - new_map = std::make_unique(shm_obj_, bi::read_write); + shm_obj_->truncate(new_size); + shm_map_ = std::make_shared(*shm_obj_, bi::read_write); + old_shm_maps_.push_back(shm_map_); + managed_buffer_ = std::make_unique( + bi::open_only, shm_map_->get_address(), new_size); + managed_buffer_->grow(new_size - current_capacity_); + current_capacity_ = managed_buffer_->get_size(); + *total_size_ = new_size; } catch (bi::interprocess_exception& ex) { - std::string error_message = std::string( - "unable to process address space or " - "shared-memory descriptor, err:") + - ex.what(); + shm_obj_->truncate(*total_size_); + std::string error_message = + ("Failed to create new mapped region for the grown shared memory " + "region '" + + shm_region_name_ + "'. " + ex.what()); throw PythonBackendException(error_message); } - - old_shm_maps_.emplace_back(std::move(shm_map_)); - current_capacity_ = *capacity_; - shm_map_ = std::move(new_map); - shm_addr_ = (char*)shm_map_->get_address(); } } -void -SharedMemory::MapOffset(char** shm_addr, size_t byte_size, off_t offset) +size_t +SharedMemoryManager::FreeMemory() { - // Update shared memory pointer and capacity if necessary. - UpdateSharedMemory(); - *shm_addr = shm_addr_ + offset; + GrowIfNeeded(0); + return managed_buffer_->get_free_memory(); +} + + +SharedMemoryManager::~SharedMemoryManager() noexcept(false) +{ + if (delete_region_) { + bi::shared_memory_object::remove(shm_region_name_.c_str()); + } } void -SharedMemory::SetOffset(off_t offset) +SharedMemoryManager::SetDeleteRegion(bool delete_region) { - *offset_ = offset; + delete_region_ = delete_region; } }}} // namespace triton::backend::python diff --git a/src/shm_manager.h b/src/shm_manager.h index 6ee31212..e0799a07 100644 --- a/src/shm_manager.h +++ b/src/shm_manager.h @@ -1,4 +1,4 @@ -// Copyright (c) 2020-2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// Copyright 2021-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions @@ -26,46 +26,226 @@ #pragma once -#include -#include -#include +#include +#include +#include +#include +#include #include -#include -#include +#include +#include +#include +#include #include +#include "pb_exception.h" namespace triton { namespace backend { namespace python { +namespace bi = boost::interprocess; -class SharedMemory { - std::string shm_key_; - size_t* capacity_; - off_t* offset_; - char* shm_addr_; +class CUDAMemoryPoolManager { + public: + CUDAMemoryPoolManager() : triton_memory_manager_(nullptr) {} - // Current capcity, local to each process. - size_t current_capacity_; + void SetCUDAPoolAddress(const int32_t device_id, void* cuda_pool_address); - // Amount of bytes to grow the shared memory when the pool is completely used. - int64_t shm_growth_bytes_; + void* CUDAPoolAddress(const int32_t device_id); - // Get the amount of shared memory available. 
- size_t GetAvailableSharedMemory(); - boost::interprocess::shared_memory_object shm_obj_; - std::unique_ptr shm_map_; - std::vector> - old_shm_maps_; + void SetTritonMemoryManager(void* triton_memory_manager); - void UpdateSharedMemory(); + void* TritonMemoryManager(); - public: - SharedMemory( - const std::string& shm_key, int64_t default_byte_size, - int64_t shm_growth_bytes, bool truncate = false); - void MapOffset(char** shm_addr, size_t byte_size, off_t offset); - void Map(char** shm_addr, size_t byte_size, off_t& offset); - void SetOffset(off_t offset); - ~SharedMemory() noexcept(false); + bool UseCudaSharedPool(const int32_t device_id); + + // Return cuda pool address map + std::unordered_map& CUDAPoolAddressMap(); + + private: + // The base address of the Triton CUDA memory pool + std::unordered_map cuda_pool_address_map_; + // The mutex to protect the cuda_pool_address_map_ + std::mutex mu_; + // TRITONBACKEND_MemoryManager + void* triton_memory_manager_; +}; + +template +struct AllocatedSharedMemory { + AllocatedSharedMemory() = default; + AllocatedSharedMemory( + std::unique_ptr>& data, + bi::managed_external_buffer::handle_t handle) + : data_(std::move(data)), handle_(handle) + { + } + + std::unique_ptr> data_; + bi::managed_external_buffer::handle_t handle_; +}; + +// The alignment here is used to extend the size of the shared memory allocation +// struct to 16 bytes. The reason for this change is that when an aligned shared +// memory location is requested using the `Construct` method, the memory +// alignment of the object will be incorrect since the shared memory ownership +// info is placed in the beginning and the actual object is placed after that +// (i.e. 4 plus the aligned address is not 16-bytes aligned). The aligned memory +// is required by semaphore otherwise it may lead to SIGBUS error on ARM. +struct alignas(16) AllocatedShmOwnership { + uint32_t ref_count_; }; +class SharedMemoryManager { + public: + SharedMemoryManager( + const std::string& shm_region_name, size_t shm_size, + size_t shm_growth_bytes, bool create); + + SharedMemoryManager(const std::string& shm_region_name); + + template + AllocatedSharedMemory Construct(uint64_t count = 1, bool aligned = false) + { + T* obj = nullptr; + AllocatedShmOwnership* shm_ownership_data = nullptr; + bi::managed_external_buffer::handle_t handle = 0; + + { + bi::scoped_lock guard{*shm_mutex_}; + std::size_t requested_bytes = + sizeof(T) * count + sizeof(AllocatedShmOwnership); + GrowIfNeeded(0); + + void* allocated_data; + try { + allocated_data = Allocate(requested_bytes, aligned); + } + catch (bi::bad_alloc& ex) { + // Try to grow the shared memory region if the allocate failed. 
+ GrowIfNeeded(requested_bytes); + allocated_data = Allocate(requested_bytes, aligned); + } + + shm_ownership_data = + reinterpret_cast(allocated_data); + obj = reinterpret_cast( + (reinterpret_cast(shm_ownership_data)) + + sizeof(AllocatedShmOwnership)); + shm_ownership_data->ref_count_ = 1; + + handle = managed_buffer_->get_handle_from_address( + reinterpret_cast(shm_ownership_data)); + } + + return WrapObjectInUniquePtr(obj, shm_ownership_data, handle); + } + + template + AllocatedSharedMemory Load( + bi::managed_external_buffer::handle_t handle, bool unsafe = false) + { + T* object_ptr; + AllocatedShmOwnership* shm_ownership_data; + + { + bi::scoped_lock guard{*shm_mutex_}; + GrowIfNeeded(0); + shm_ownership_data = reinterpret_cast( + managed_buffer_->get_address_from_handle(handle)); + object_ptr = reinterpret_cast( + reinterpret_cast(shm_ownership_data) + + sizeof(AllocatedShmOwnership)); + if (!unsafe) { + shm_ownership_data->ref_count_ += 1; + } + } + + return WrapObjectInUniquePtr(object_ptr, shm_ownership_data, handle); + } + + size_t FreeMemory(); + + void Deallocate(bi::managed_external_buffer::handle_t handle) + { + bi::scoped_lock guard{*shm_mutex_}; + GrowIfNeeded(0); + void* ptr = managed_buffer_->get_address_from_handle(handle); + managed_buffer_->deallocate(ptr); + } + + void DeallocateUnsafe(bi::managed_external_buffer::handle_t handle) + { + void* ptr = managed_buffer_->get_address_from_handle(handle); + managed_buffer_->deallocate(ptr); + } + + void GrowIfNeeded(uint64_t bytes); + bi::interprocess_mutex* Mutex() { return shm_mutex_; } + + void SetDeleteRegion(bool delete_region); + + std::unique_ptr& GetCUDAMemoryPoolManager() + { + return cuda_memory_pool_manager_; + } + + uint64_t GetCurrentCapacity() { return current_capacity_; } + void* GetBaseAddress() { return managed_buffer_->get_address(); } + + ~SharedMemoryManager() noexcept(false); + + private: + std::string shm_region_name_; + std::unique_ptr managed_buffer_; + std::unique_ptr shm_obj_; + std::shared_ptr shm_map_; + std::vector> old_shm_maps_; + uint64_t current_capacity_; + bi::interprocess_mutex* shm_mutex_; + size_t shm_growth_bytes_; + uint64_t* total_size_; + bool create_; + bool delete_region_; + std::unique_ptr cuda_memory_pool_manager_; + + template + AllocatedSharedMemory WrapObjectInUniquePtr( + T* object, AllocatedShmOwnership* shm_ownership_data, + const bi::managed_external_buffer::handle_t& handle) + { + // Custom deleter to conditionally deallocate the object + std::function deleter = [this, handle, + shm_ownership_data](T* memory) { + bool destroy = false; + bi::scoped_lock guard{*shm_mutex_}; + // Before using any shared memory function you need to make sure that you + // are using the correct mapping. For example, shared memory growth may + // happen between the time an object was created and the time the object + // gets destructed. 
+ GrowIfNeeded(0); + shm_ownership_data->ref_count_ -= 1; + if (shm_ownership_data->ref_count_ == 0) { + destroy = true; + } + if (destroy) { + DeallocateUnsafe(handle); + } + }; + + auto data = std::unique_ptr(object, deleter); + return AllocatedSharedMemory(data, handle); + } + + void* Allocate(uint64_t requested_bytes, bool aligned) + { + void* ptr; + if (aligned) { + const std::size_t alignment = 32; + ptr = managed_buffer_->allocate_aligned(requested_bytes, alignment); + } else { + ptr = managed_buffer_->allocate(requested_bytes); + } + + return ptr; + } +}; }}} // namespace triton::backend::python diff --git a/src/shm_monitor/CMakeLists.txt b/src/shm_monitor/CMakeLists.txt new file mode 100644 index 00000000..2ae8bd45 --- /dev/null +++ b/src/shm_monitor/CMakeLists.txt @@ -0,0 +1,49 @@ +# Copyright (c) 2022-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +cmake_minimum_required (VERSION 3.31.8) + +pybind11_add_module( + triton-shm-monitor + EXCLUDE_FROM_ALL + ./shm_monitor.cc + ../shm_manager.h + ../shm_manager.cc +) + +target_link_libraries( + triton-shm-monitor + PRIVATE + -lrt # shared memory +) + +set_property(TARGET triton-shm-monitor PROPERTY OUTPUT_NAME triton_shm_monitor) + +install( + TARGETS + triton-shm-monitor + LIBRARY DESTINATION ${CMAKE_INSTALL_PREFIX}/backends/python OPTIONAL +) diff --git a/src/shm_monitor/shm_monitor.cc b/src/shm_monitor/shm_monitor.cc new file mode 100644 index 00000000..e0c08d3c --- /dev/null +++ b/src/shm_monitor/shm_monitor.cc @@ -0,0 +1,41 @@ +// Copyright 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions +// are met: +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. 
+// * Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+// * Neither the name of NVIDIA CORPORATION nor the names of its
+// contributors may be used to endorse or promote products derived
+// from this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+#include <pybind11/pybind11.h>
+
+#include "../shm_manager.h"
+
+namespace triton { namespace backend { namespace python {
+namespace py = pybind11;
+
+PYBIND11_MODULE(triton_shm_monitor, m)
+{
+  py::class_<SharedMemoryManager>(m, "SharedMemoryManager")
+      .def(py::init<std::string>())
+      .def("free_memory", &SharedMemoryManager::FreeMemory);
+}
+
+}}}  // namespace triton::backend::python
diff --git a/src/stub_launcher.cc b/src/stub_launcher.cc
new file mode 100644
index 00000000..32f5d1bd
--- /dev/null
+++ b/src/stub_launcher.cc
@@ -0,0 +1,1006 @@
+// Copyright 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions
+// are met:
+// * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+// * Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+// * Neither the name of NVIDIA CORPORATION nor the names of its
+// contributors may be used to endorse or promote products derived
+// from this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
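Editorial aside before the stub launcher sources: the Construct/Load pair added in shm_manager.h above hands out AllocatedSharedMemory<T> wrappers whose custom deleter decrements a shared ref count and frees the block only when the count reaches zero. Below is a minimal round-trip sketch under that reading (not part of the patch; ExamplePayload and ShmRoundTrip are hypothetical names).

// Illustrative sketch only (not part of the diff). "ExamplePayload" is a
// hypothetical POD type; SharedMemoryManager and AllocatedSharedMemory are
// the types introduced in shm_manager.h above.
#include <cstdint>

#include "shm_manager.h"

namespace pbe = triton::backend::python;
namespace bip = boost::interprocess;

struct ExamplePayload {
  int32_t value;
};

void
ShmRoundTrip(pbe::SharedMemoryManager& shm_pool)
{
  // Creating side: place the object in the managed buffer (ref count == 1).
  pbe::AllocatedSharedMemory<ExamplePayload> created =
      shm_pool.Construct<ExamplePayload>();
  created.data_->value = 42;

  // Only the opaque handle needs to cross the process boundary (typically
  // through a message queue); the peer re-attaches with Load(), which bumps
  // the ref count to 2.
  bip::managed_external_buffer::handle_t handle = created.handle_;
  pbe::AllocatedSharedMemory<ExamplePayload> loaded =
      shm_pool.Load<ExamplePayload>(handle);

  // The custom deleter decrements the ref count on destruction; the block is
  // deallocated only when the last AllocatedSharedMemory wrapper goes away.
  (void)loaded.data_->value;
}

The stub launcher below relies on the same pattern: only the handle_ is pushed through a message queue, and the receiving process re-attaches with Load().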
+ +#include "stub_launcher.h" + +#include + +#include "pb_utils.h" +#include "python_be.h" + +#ifdef _WIN32 +#include // getpid() +#endif + +extern char** environ; + +namespace triton { namespace backend { namespace python { + +StubLauncher::StubLauncher(const std::string stub_process_kind) + : parent_pid_(0), is_initialized_(false), + stub_process_kind_(stub_process_kind), model_instance_name_(""), + device_id_(0), kind_("") +{ +} + +StubLauncher::StubLauncher( + const std::string stub_process_kind, const std::string model_instance_name, + const int32_t device_id, const std::string kind) + : is_initialized_(false), stub_process_kind_(stub_process_kind), + model_instance_name_(model_instance_name), device_id_(device_id), + kind_(kind) +{ +} + +TRITONSERVER_Error* +StubLauncher::Initialize(ModelState* model_state) +{ + model_name_ = model_state->Name(); + shm_default_byte_size_ = + model_state->StateForBackend()->shm_default_byte_size; + shm_growth_byte_size_ = model_state->StateForBackend()->shm_growth_byte_size; + shm_message_queue_size_ = + model_state->StateForBackend()->shm_message_queue_size; + python_execution_env_ = model_state->PythonExecutionEnv(); + python_lib_ = model_state->StateForBackend()->python_lib; + model_state->ModelConfig().Write(&model_config_buffer_); + is_decoupled_ = model_state->IsDecoupled(); + model_repository_path_ = model_state->RepositoryPath(); + runtime_modeldir_ = model_state->RuntimeModelDir(); + if (runtime_modeldir_.empty()) { + runtime_modeldir_ = "DEFAULT"; + } +#ifdef _WIN32 + ZeroMemory(&startup_info_, sizeof(startup_info_)); + startup_info_.cb = sizeof(startup_info_); + ZeroMemory(&stub_pid_, sizeof(stub_pid_)); +#else + stub_pid_ = 0; +#endif + + shm_region_name_ = + model_state->StateForBackend()->shared_memory_region_prefix + + GenerateUUID(); + + model_version_ = model_state->Version(); + + std::stringstream ss; + const char os_slash = std::filesystem::path::preferred_separator; + ss << model_repository_path_ << os_slash << model_version_ << os_slash; + std::string artifact_name; + RETURN_IF_ERROR(model_state->ModelConfig().MemberAsString( + "default_model_filename", &artifact_name)); + if (artifact_name.size() > 0) { + ss << artifact_name; + } else { + // Default artifact name. + ss << "model.py"; + } + + model_path_ = ss.str(); + + // FIXME [DLIS-5969]: Enable for Windows when custom execution environments + // are supported. + if (python_execution_env_ != "") { +#ifndef _WIN32 + RETURN_IF_ERROR(GetPythonEnvironment(model_state)); +#else + return TRITONSERVER_ErrorNew( + TRITONSERVER_ERROR_UNSUPPORTED, + "Custom execution environments are not currently supported on " + "Windows."); +#endif + } + + + parent_pid_ = getpid(); + + return nullptr; +} + +TRITONSERVER_Error* +StubLauncher::Setup() +{ + // Destruct any in-use shared memory object before starting the stub process. + ipc_control_ = nullptr; + stub_message_queue_ = nullptr; + parent_message_queue_ = nullptr; + stub_to_parent_mq_ = nullptr; + parent_to_stub_mq_ = nullptr; + memory_manager_ = nullptr; + + try { + // It is necessary for restart to make sure that the previous shared memory + // pool is destructed before the new pool is created. 
+ shm_pool_ = nullptr; + shm_pool_ = std::make_unique( + shm_region_name_, shm_default_byte_size_, shm_growth_byte_size_, + true /* create */); + } + catch (const PythonBackendException& pb_exception) { + return TRITONSERVER_ErrorNew( + TRITONSERVER_ERROR_INTERNAL, pb_exception.what()); + } + + AllocatedSharedMemory current_ipc_control = + shm_pool_->Construct(); + ipc_control_ = std::move(current_ipc_control.data_); + ipc_control_handle_ = current_ipc_control.handle_; + + RETURN_IF_EXCEPTION( + stub_message_queue_ = + MessageQueue::Create( + shm_pool_, shm_message_queue_size_)); + RETURN_IF_EXCEPTION( + parent_message_queue_ = + MessageQueue::Create( + shm_pool_, shm_message_queue_size_)); + RETURN_IF_EXCEPTION( + stub_to_parent_mq_ = + MessageQueue::Create( + shm_pool_, shm_message_queue_size_)); + RETURN_IF_EXCEPTION( + parent_to_stub_mq_ = + MessageQueue::Create( + shm_pool_, shm_message_queue_size_)); + + std::unique_ptr> memory_manager_message_queue; + RETURN_IF_EXCEPTION( + memory_manager_message_queue = + MessageQueue::Create(shm_pool_, shm_message_queue_size_)); + + memory_manager_message_queue->ResetSemaphores(); + ipc_control_->memory_manager_message_queue = + memory_manager_message_queue->ShmHandle(); + ipc_control_->decoupled = is_decoupled_; + + memory_manager_ = + std::make_unique(std::move(memory_manager_message_queue)); + ipc_control_->parent_message_queue = parent_message_queue_->ShmHandle(); + ipc_control_->stub_to_parent_mq = stub_to_parent_mq_->ShmHandle(); + ipc_control_->stub_message_queue = stub_message_queue_->ShmHandle(); + ipc_control_->parent_to_stub_mq = parent_to_stub_mq_->ShmHandle(); + + new (&(ipc_control_->stub_health_mutex)) bi::interprocess_mutex; + health_mutex_ = &(ipc_control_->stub_health_mutex); + + stub_message_queue_->ResetSemaphores(); + parent_message_queue_->ResetSemaphores(); + stub_to_parent_mq_->ResetSemaphores(); + parent_to_stub_mq_->ResetSemaphores(); + + is_initialized_ = false; + + return nullptr; +} + +// FIXME: This should be merged with the Unix launch function once Windows +// CI and functionality are demonstrably stable. The goal of keeping the +// functions separate is to help debug Windows-specific issues without worrying +// about the impact to our Unix builds. 
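Aside: both Launch() variants that follow hand the stub executable the same fixed positional argument list (model path, shm region name, default and growth byte sizes, parent pid, python lib path, IPC control handle, stub name, runtime model dir). A hypothetical stub-side parse is shown here only to make that order explicit; the real parsing lives in the stub sources, which are not part of this hunk.

// Illustrative sketch only; the struct and function names are hypothetical.
// The field order mirrors the argument list assembled in
// StubLauncher::Launch() below (argv[0] is the stub binary itself).
#include <cstdint>
#include <stdexcept>
#include <string>

struct StubArgs {
  std::string model_path;         // argv[1]
  std::string shm_region_name;    // argv[2]
  int64_t shm_default_byte_size;  // argv[3]
  int64_t shm_growth_byte_size;   // argv[4]
  int64_t parent_pid;             // argv[5]
  std::string python_lib;         // argv[6]
  int64_t ipc_control_handle;     // argv[7]
  std::string stub_name;          // argv[8]
  std::string runtime_modeldir;   // argv[9]
};

inline StubArgs
ParseStubArgs(int argc, char** argv)
{
  if (argc < 10) {
    throw std::invalid_argument("expected 9 positional stub arguments");
  }
  StubArgs args;
  args.model_path = argv[1];
  args.shm_region_name = argv[2];
  args.shm_default_byte_size = std::stoll(argv[3]);
  args.shm_growth_byte_size = std::stoll(argv[4]);
  args.parent_pid = std::stoll(argv[5]);
  args.python_lib = argv[6];
  args.ipc_control_handle = std::stoll(argv[7]);
  args.stub_name = argv[8];
  args.runtime_modeldir = argv[9];
  return args;
}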
+#ifdef _WIN32 +TRITONSERVER_Error* +StubLauncher::Launch() +{ + std::string stub_name; + if (stub_process_kind_ == "AUTOCOMPLETE_STUB") { + stub_name = model_name_; + } else { + stub_name = model_instance_name_; + } + + const char os_slash = std::filesystem::path::preferred_separator; + + const std::string stub_executable_name = "triton_python_backend_stub.exe"; + SanitizePath(model_path_); + SanitizePath(model_repository_path_); + + // Default Python backend stub + std::string python_backend_stub = + python_lib_ + os_slash + stub_executable_name; + + LOG_MESSAGE( + TRITONSERVER_LOG_INFO, + (std::string("Stub path ") + python_backend_stub).c_str()); + + // Path to alternative Python backend stub + std::string model_python_backend_stub = + std::string(model_repository_path_) + os_slash + stub_executable_name; + + LOG_MESSAGE( + TRITONSERVER_LOG_INFO, + (std::string("Alt path ") + python_backend_stub).c_str()); + + // Check if file exists + // TODO: Integrate win32 and pb_env + if (FileExists(model_python_backend_stub)) { + python_backend_stub = model_python_backend_stub; + } + + std::string launch_command; + + std::stringstream ss; + ss << python_backend_stub << " " << model_path_ << " " << shm_region_name_ + << " " << shm_default_byte_size_ << " " << shm_growth_byte_size_ << " " + << parent_pid_ << " " << python_lib_ << " " << ipc_control_handle_ << " " + << stub_name << " " << runtime_modeldir_; + launch_command = ss.str(); + + LOG_MESSAGE( + TRITONSERVER_LOG_INFO, + (std::string("Starting Python backend stub: ") + launch_command).c_str()); + + LPSTR launch_command_lpstr = const_cast(launch_command.c_str()); + // Start the child process. Unlike fork(), the remainder of this + // function exists in the context of the parent, only. + if (!CreateProcess( + NULL, // No module name (use command line) + launch_command_lpstr, // Command line + NULL, // Process handle not inheritable + NULL, // Thread handle not inheritable + FALSE, // Set handle inheritance to FALSE + 0, // No creation flags + NULL, // Use parent's environment block + NULL, // Use parent's starting directory + &startup_info_, // Pointer to STARTUPINFO structure + &stub_pid_) // Pointer to PROCESS_INFORMATION structure + ) { + std::stringstream ss; + ss << "Failed to run python backend stub. Errno = " << errno << '\n' + << "Python backend stub path: " << python_backend_stub << '\n' + << "Shared Memory Region Name: " << shm_region_name_ << '\n' + << "Shared Memory Default Byte Size: " << shm_default_byte_size_ << '\n' + << "Shared Memory Growth Byte Size: " << shm_growth_byte_size_ << '\n'; + // Print the error message directly because the underlying mutexes in + // LOG_MESSAGE() could be forked when it is locked by other thread(s). + std::cerr << '\n' << ss.str() << '\n'; + _Exit(1); + } + ScopedDefer _([&] { + // Push a dummy message to the message queue so that the stub + // process is notified that it can release the object stored in + // shared memory. + if (stub_message_queue_) { + stub_message_queue_->Push(DUMMY_MESSAGE); + } + + // If the model is not initialized, wait for the stub process to exit. + if (!is_initialized_) { + stub_message_queue_.reset(); + parent_message_queue_.reset(); + memory_manager_.reset(); + WaitForStubProcess(); + } + }); + + // The stub process would send two messages to the parent process during the + // initialization. + // 1. When the stub process's health monitoring thread has started. + // 2. When the initialization is fully completed and the Python model is + // loaded. 
+ // + // The reason it is broken into two steps is that creation of the health + // monitoring thread may take longer which can make the server process think + // that the stub process is unhealthy and return early. Waiting with a longer + // timeout prevents this issue. + const uint64_t initialization_timeout_ms = 10000; // 10 sec + LOG_MESSAGE( + TRITONSERVER_LOG_VERBOSE, + "Waiting for the stub health monitoring thread to start"); + + bi::managed_external_buffer::handle_t message; + auto err = ReceiveMessageFromStub(message, initialization_timeout_ms); + if (err != nullptr) { + KillStubProcess(); + } + + if (stub_process_kind_ == "AUTOCOMPLETE_STUB") { + if (err != nullptr) { + throw BackendModelException(err); + } + try { + AutocompleteStubProcess(); + } + catch (const PythonBackendException& ex) { + // Need to kill the stub process first + KillStubProcess(); + throw BackendModelException( + TRITONSERVER_ErrorNew(TRITONSERVER_ERROR_INTERNAL, ex.what())); + } + } else if (stub_process_kind_ == "MODEL_INSTANCE_STUB") { + RETURN_IF_ERROR(err); + RETURN_IF_ERROR(ModelInstanceStubProcess()); + } else { + return TRITONSERVER_ErrorNew( + TRITONSERVER_ERROR_INTERNAL, + (std::string("Unknown stub_process_kind: ") + stub_process_kind_) + .c_str()); + } + + is_initialized_ = true; + + return nullptr; +} +#else +TRITONSERVER_Error* +StubLauncher::Launch() +{ + std::string stub_name; + if (stub_process_kind_ == "AUTOCOMPLETE_STUB") { + stub_name = model_name_; + } else { + stub_name = model_instance_name_; + } + + if (!IsValidIdentifier(stub_name)) { + return TRITONSERVER_ErrorNew( + TRITONSERVER_ERROR_INVALID_ARG, + "Invalid stub name: contains invalid characters"); + } + + if (!IsValidIdentifier(shm_region_name_)) { + return TRITONSERVER_ErrorNew( + TRITONSERVER_ERROR_INVALID_ARG, + "Invalid shared memory region name: contains invalid characters"); + } + + // Default Python backend stub + std::string python_backend_stub = python_lib_ + "/triton_python_backend_stub"; + + // Path to alternative Python backend stub + std::string model_python_backend_stub = + std::string(model_repository_path_) + "/triton_python_backend_stub"; + + if (FileExists(model_python_backend_stub)) { + python_backend_stub = model_python_backend_stub; + } + + if (!IsExecutableFile(python_backend_stub)) { + // Give the execute permission for the triton_python_backend_stub to the + // owner. + int error = chmod(python_backend_stub.c_str(), S_IXUSR); + if (error != 0) { + return TRITONSERVER_ErrorNew( + TRITONSERVER_ERROR_INTERNAL, + (std::string("Failed to give execute permission to " + "triton_python_backend_stub in ") + + python_backend_stub + " " + stub_name + + " Error No.: " + std::to_string(error)) + .c_str()); + } + } + + // Prepare arguments for execution + std::vector arg_strings; + std::vector exec_args; + + // This shared memory variable indicates whether the stub process should + // revert the LD_LIBRARY_PATH changes to avoid shared library issues in + // executables and libraries. 
+ ipc_control_->uses_env = false; + + if (python_execution_env_ != "") { + ipc_control_->uses_env = true; + + // Parse environment variables from activation script + std::map env_vars = + ParseActivationScript(path_to_activate_); + + // Prepare environment with additional library path + auto [env_strings, custom_env] = + PrepareEnvironment(env_vars, path_to_libpython_); + + // Set up arguments for direct execution + arg_strings.push_back(python_backend_stub); + arg_strings.push_back(model_path_); + arg_strings.push_back(shm_region_name_); + arg_strings.push_back(std::to_string(shm_default_byte_size_)); + arg_strings.push_back(std::to_string(shm_growth_byte_size_)); + arg_strings.push_back(std::to_string(parent_pid_)); + arg_strings.push_back(python_lib_); + arg_strings.push_back(std::to_string(ipc_control_handle_)); + arg_strings.push_back(stub_name); + arg_strings.push_back(runtime_modeldir_); + + // Convert strings to char* array for exec + for (const auto& arg : arg_strings) { + exec_args.push_back(arg.c_str()); + } + exec_args.push_back(nullptr); // exec requires null termination + + // Log the command being executed + std::ostringstream log_cmd; + for (size_t i = 0; i < arg_strings.size(); ++i) { + if (i > 0) + log_cmd << " "; + log_cmd << "'" << arg_strings[i] << "'"; + } + LOG_MESSAGE( + TRITONSERVER_LOG_VERBOSE, + (std::string("Starting Python backend stub with custom environment: ") + + log_cmd.str()) + .c_str()); + + pid_t pid = fork(); + if (pid < 0) { + return TRITONSERVER_ErrorNew( + TRITONSERVER_ERROR_INTERNAL, + "Failed to fork the stub process for auto-complete."); + } + if (pid == 0) { + // Replace this child process with the new stub process using custom + // environment + execve( + python_backend_stub.c_str(), const_cast(exec_args.data()), + custom_env.data()); + // execve() never returns if succeeded. Otherwise, an error has occurred. + std::stringstream ss; + ss << "Failed to run python backend stub with custom environment. Errno " + "= " + << errno << '\n' + << "Python backend stub path: " << python_backend_stub << '\n' + << "Activation script: " << path_to_activate_ << '\n' + << "Library path: " << path_to_libpython_ << '\n'; + std::cerr << '\n' << ss.str() << '\n'; + _Exit(1); + } else { + stub_pid_ = pid; + } + + } else { + arg_strings.push_back(python_backend_stub); + arg_strings.push_back(model_path_); + arg_strings.push_back(shm_region_name_); + arg_strings.push_back(std::to_string(shm_default_byte_size_)); + arg_strings.push_back(std::to_string(shm_growth_byte_size_)); + arg_strings.push_back(std::to_string(parent_pid_)); + arg_strings.push_back(python_lib_); + arg_strings.push_back(std::to_string(ipc_control_handle_)); + arg_strings.push_back(stub_name); + arg_strings.push_back(runtime_modeldir_); + + // Convert strings to char* array for exec + for (const auto& arg : arg_strings) { + exec_args.push_back(arg.c_str()); + } + exec_args.push_back(nullptr); // exec requires null termination + + // Log the command being executed + std::ostringstream log_cmd; + for (size_t i = 0; i < arg_strings.size(); ++i) { + if (i > 0) + log_cmd << " "; + log_cmd << "'" << arg_strings[i] << "'"; + } + LOG_MESSAGE( + TRITONSERVER_LOG_VERBOSE, + (std::string("Starting Python backend stub: ") + log_cmd.str()) + .c_str()); + + pid_t pid = fork(); + if (pid < 0) { + return TRITONSERVER_ErrorNew( + TRITONSERVER_ERROR_INTERNAL, + "Failed to fork the stub process for auto-complete."); + } + if (pid == 0) { + // Replace this child process with the new stub process. 
+ execv(python_backend_stub.c_str(), const_cast(exec_args.data())); + // execv() never returns if succeeded. Otherwise, an error has occurred. + std::stringstream ss; + ss << "Failed to run python backend stub. Errno = " << errno << '\n' + << "Python backend stub path: " << python_backend_stub << '\n'; + std::cerr << '\n' << ss.str() << '\n'; + _Exit(1); + } else { + stub_pid_ = pid; + } + } + + ScopedDefer _([&] { + // Push a dummy message to the message queue so that the stub + // process is notified that it can release the object stored in + // shared memory. + if (stub_message_queue_) { + stub_message_queue_->Push(DUMMY_MESSAGE); + } + + // If the model is not initialized, wait for the stub process to exit. + if (!is_initialized_) { + stub_message_queue_.reset(); + parent_message_queue_.reset(); + memory_manager_.reset(); + WaitForStubProcess(); + } + }); + + // The stub process would send two messages to the parent process during the + // initialization. + // 1. When the stub process's health monitoring thread has started. + // 2. When the initialization is fully completed and the Python model is + // loaded. + // + // The reason it is broken into two steps is that creation of the health + // monitoring thread may take longer which can make the server process think + // that the stub process is unhealthy and return early. Waiting with a + // longer timeout prevents this issue. + const uint64_t initialization_timeout_ms = 10000; // 10 sec + LOG_MESSAGE( + TRITONSERVER_LOG_VERBOSE, + "Waiting for the stub health monitoring thread to start"); + + bi::managed_external_buffer::handle_t message; + auto err = ReceiveMessageFromStub(message, initialization_timeout_ms); + if (err != nullptr) { + KillStubProcess(); + } + + if (stub_process_kind_ == "AUTOCOMPLETE_STUB") { + if (err != nullptr) { + throw BackendModelException(err); + } + try { + AutocompleteStubProcess(); + } + catch (const PythonBackendException& ex) { + // Need to kill the stub process first + KillStubProcess(); + throw BackendModelException( + TRITONSERVER_ErrorNew(TRITONSERVER_ERROR_INTERNAL, ex.what())); + } + } else if (stub_process_kind_ == "MODEL_INSTANCE_STUB") { + RETURN_IF_ERROR(err); + RETURN_IF_ERROR(ModelInstanceStubProcess()); + } else { + return TRITONSERVER_ErrorNew( + TRITONSERVER_ERROR_INTERNAL, + (std::string("Unknown stub_process_kind: ") + stub_process_kind_) + .c_str()); + } + + is_initialized_ = true; + + return nullptr; +} + +TRITONSERVER_Error* +StubLauncher::GetPythonEnvironment(ModelState* model_state) +{ + std::string python_execution_env = ""; + try { + python_execution_env = + model_state->StateForBackend()->env_manager->ExtractIfNotExtracted( + python_execution_env_); + } + catch (PythonBackendException& pb_exception) { + return TRITONSERVER_ErrorNew( + TRITONSERVER_ERROR_INTERNAL, pb_exception.what()); + } + + path_to_activate_ = python_execution_env + "/bin/activate"; + path_to_libpython_ = python_execution_env + "/lib"; + if (python_execution_env.length() > 0 && !FileExists(path_to_activate_)) { + return TRITONSERVER_ErrorNew( + TRITONSERVER_ERROR_INTERNAL, + ("Path " + path_to_activate_ + + " does not exist. 
The Python environment should contain an " + "'activate' script.") + .c_str()); + } + return nullptr; +} +#endif + +void +StubLauncher::AutocompleteStubProcess() +{ + std::string model_config = model_config_buffer_.MutableContents(); + + std::unique_ptr auto_complete_message = + IPCMessage::Create(shm_pool_, false /* inline_response */); + auto_complete_message->Command() = PYTHONSTUB_AutoCompleteRequest; + + std::unique_ptr pb_string = + PbString::Create(shm_pool_, model_config); + bi::managed_external_buffer::handle_t string_handle = pb_string->ShmHandle(); + + auto_complete_message->Args() = string_handle; + stub_message_queue_->Push(auto_complete_message->ShmHandle()); + + std::unique_ptr auto_complete_response_message = + IPCMessage::LoadFromSharedMemory(shm_pool_, parent_message_queue_->Pop()); + + if (auto_complete_response_message->Command() != + PYTHONSTUB_AutoCompleteResponse) { + throw PythonBackendException( + "Received unexpected response from Python backend stub: " + + model_name_); + } + + auto auto_complete_response = + std::move((shm_pool_->Load( + auto_complete_response_message->Args()))) + .data_; + + if (auto_complete_response->response_has_error) { + if (auto_complete_response->response_is_error_set) { + std::unique_ptr error_message = PbString::LoadFromSharedMemory( + shm_pool_, auto_complete_response->response_error); + throw PythonBackendException(error_message->String()); + } else { + throw PythonBackendException("Auto-complete failed for " + model_name_); + } + } + + if (auto_complete_response->response_has_model_config) { + std::unique_ptr auto_complete_config = + PbString::LoadFromSharedMemory( + shm_pool_, auto_complete_response->response_model_config); + std::string auto_complete_config_string = auto_complete_config->String(); + if (!auto_complete_config_string.empty()) { + TRITONSERVER_Error* err = + auto_complete_config_.Parse(auto_complete_config_string); + if (err != nullptr) { + throw PythonBackendException("Failed to parse auto-complete JSON."); + } + } + } +} + +TRITONSERVER_Error* +StubLauncher::ModelInstanceStubProcess() +{ + std::unordered_map initialize_map = { + {"model_config", model_config_buffer_.MutableContents()}, + {"model_instance_kind", kind_}, + {"model_instance_name", model_instance_name_}, + {"model_instance_device_id", std::to_string(device_id_)}, + {"model_repository", model_repository_path_}, + {"model_version", std::to_string(model_version_)}, + {"model_name", model_name_}}; + + std::unique_ptr initialize_message = + IPCMessage::Create(shm_pool_, false /* inline_response */); + initialize_message->Command() = PYTHONSTUB_InitializeRequest; + + std::unique_ptr pb_map = PbMap::Create(shm_pool_, initialize_map); + bi::managed_external_buffer::handle_t initialize_map_handle = + pb_map->ShmHandle(); + + initialize_message->Args() = initialize_map_handle; + stub_message_queue_->Push(initialize_message->ShmHandle()); + + const uint64_t initialization_timeout_ms = 5000; // 5 sec + LOG_MESSAGE( + TRITONSERVER_LOG_VERBOSE, + "Waiting for the stub process initialization response"); + + bi::managed_external_buffer::handle_t message; + RETURN_IF_ERROR(ReceiveMessageFromStub(message, initialization_timeout_ms)); + + std::unique_ptr initialize_response_message = + IPCMessage::LoadFromSharedMemory(shm_pool_, message); + + if (initialize_response_message->Command() != PYTHONSTUB_InitializeResponse) { + return TRITONSERVER_ErrorNew( + TRITONSERVER_ERROR_INTERNAL, + (std::string( + "Received unexpected response from Python backend stub: ") + + 
model_instance_name_) + .c_str()); + } + + auto initialize_response = + std::move((shm_pool_->Load( + initialize_response_message->Args()))) + .data_; + + if (initialize_response->response_has_error) { + if (initialize_response->response_is_error_set) { + std::unique_ptr error_message = PbString::LoadFromSharedMemory( + shm_pool_, initialize_response->response_error); + return TRITONSERVER_ErrorNew( + TRITONSERVER_ERROR_INTERNAL, error_message->String().c_str()); + } else { + return TRITONSERVER_ErrorNew( + TRITONSERVER_ERROR_INTERNAL, + (std::string("Launch stub process failed for ") + model_name_) + .c_str()); + } + } + + return nullptr; +} + +bool +StubLauncher::StubActive() +{ +#ifdef _WIN32 + DWORD ec; + GetExitCodeProcess(stub_pid_.hProcess, &ec); + return (ec == STILL_ACTIVE); +#else + return (stub_pid_ != 0); +#endif +} + +void +StubLauncher::UpdateHealth() +{ + is_healthy_ = false; + if (is_initialized_) { + { + bi::scoped_lock lock(*health_mutex_); + ipc_control_->stub_health = false; + } + +// Sleep 1 second so that the child process has a chance to change the +// health variable +#ifdef _WIN32 + Sleep(1); +#else + sleep(1); +#endif + + { + bi::scoped_lock lock(*health_mutex_); + is_healthy_ = ipc_control_->stub_health; + } + } +} + +void +StubLauncher::TerminateStub() +{ + if (is_initialized_) { + bool force_kill = false; + if (is_healthy_) { + // Finalize command does not have any arguments. + std::unique_ptr ipc_message = + IPCMessage::Create(shm_pool_, false /* inline_response */); + + ipc_message->Command() = PYTHONSTUB_FinalizeRequest; + stub_message_queue_->Push(ipc_message->ShmHandle()); + parent_message_queue_->Pop(); + + stub_message_queue_.reset(); + parent_message_queue_.reset(); + memory_manager_.reset(); + } else { + force_kill = true; + } + + if (force_kill) { + KillStubProcess(); + } else { + WaitForStubProcess(); + } + } + + // First destroy the IPCControl. This makes sure that IPCControl is + // destroyed before the shared memory manager goes out of scope. + ipc_control_.reset(); + stub_message_queue_.reset(); + parent_message_queue_.reset(); + memory_manager_.reset(); +} + +void +StubLauncher::ClearQueues() +{ + stub_to_parent_mq_.reset(); + parent_to_stub_mq_.reset(); +} + +void +StubLauncher::KillStubProcess() +{ +#ifdef _WIN32 + unsigned int exit_code; + TerminateProcess(stub_pid_.hProcess, exit_code); + CloseHandle(stub_pid_.hProcess); + CloseHandle(stub_pid_.hThread); +#else + kill(stub_pid_, SIGKILL); + WaitForStubProcess(); + stub_pid_ = 0; +#endif +} + +TRITONSERVER_Error* +StubLauncher::ReceiveMessageFromStub( + bi::managed_external_buffer::handle_t& message, + uint64_t timeout_miliseconds) +{ + bool success = false; + while (!success) { + { + boost::posix_time::ptime timeout = + boost::get_system_time() + + boost::posix_time::milliseconds(timeout_miliseconds); + + bi::scoped_lock lock(*health_mutex_, timeout); + + // Check if lock has been acquired. + if (lock) { + ipc_control_->stub_health = false; + } else { + // If it failed to obtain the lock, it means that the stub has been + // stuck or exited while holding the health mutex lock. 
+ return TRITONSERVER_ErrorNew( + TRITONSERVER_ERROR_INTERNAL, "Failed to obtain the health mutex."); + } + } + + message = parent_message_queue_->Pop( + timeout_miliseconds /* duration ms */, success); + + bool is_stub_alive = false; + { + boost::posix_time::ptime timeout = + boost::get_system_time() + boost::posix_time::seconds(1); + bi::scoped_lock lock(*health_mutex_, timeout); + if (lock) { + is_stub_alive = ipc_control_->stub_health; + } else { + // If It failed to obtain the lock, it means that the stub has been + // stuck or exited while holding the health mutex lock. + is_stub_alive = false; + } + } + + if (!success && !is_stub_alive) { + return TRITONSERVER_ErrorNew( + TRITONSERVER_ERROR_INTERNAL, + (std::string("Stub process '") + model_instance_name_ + + "' is not healthy.") + .c_str()); + } + } + + return nullptr; // success +} + +void +StubLauncher::WaitForStubProcess() +{ +#ifdef _WIN32 + WaitForSingleObject(stub_pid_.hProcess, INFINITE); + CloseHandle(stub_pid_.hProcess); + CloseHandle(stub_pid_.hThread); +#else + int status; + if (stub_pid_ != 0) { + // Added this check to ensure server doesn't hang waiting after stub + // process has already be killed and cannot be waited on + waitpid(stub_pid_, &status, 0); + } +#endif +} + +#ifdef TRITON_ENABLE_GPU +void +StubLauncher::ShareCUDAMemoryPool( + TRITONBACKEND_MemoryManager* triton_mem_manager, const int32_t device_id) +{ + std::lock_guard lock(cuda_shm_pool_mutex_); + if ((tried_sharing_cuda_pool_map_.find(device_id) != + tried_sharing_cuda_pool_map_.end()) && + tried_sharing_cuda_pool_map_[device_id]) { + return; + } + + std::unique_ptr ipc_message = + IPCMessage::Create(shm_pool_, true /* inline_response */); + CUDAMemPoolMessage* cuda_pool_message_ptr = nullptr; + PythonBackendException pb_exception(std::string{}); + + try { + // Create a dummy BackendMemory object to get the start address of the CUDA + // memory pool. + BackendMemory* backend_memory; + std::unique_ptr lbackend_memory; + + THROW_IF_TRITON_ERROR(BackendMemory::Create( + triton_mem_manager, BackendMemory::AllocationType::GPU_POOL, device_id, + 1 /* byte size*/, &backend_memory)); + lbackend_memory.reset(backend_memory); + + CUDAHandler& cuda_api = CUDAHandler::getInstance(); + CUdeviceptr cuda_pool_address = 0; + cuda_api.PointerGetAttribute( + &cuda_pool_address, CU_POINTER_ATTRIBUTE_RANGE_START_ADDR, + reinterpret_cast(lbackend_memory->MemoryPtr())); + + shm_pool_->GetCUDAMemoryPoolManager()->SetCUDAPoolAddress( + device_id, reinterpret_cast(cuda_pool_address)); + shm_pool_->GetCUDAMemoryPoolManager()->SetTritonMemoryManager( + reinterpret_cast(triton_mem_manager)); + + // Get the memory handle from the CUDA memory pool. 
+ AllocatedSharedMemory cuda_pool_message = + shm_pool_->Construct(); + cuda_pool_message_ptr = cuda_pool_message.data_.get(); + { + ScopedSetDevice scoped_set_device(device_id); + THROW_IF_CUDA_ERROR(cudaIpcGetMemHandle( + reinterpret_cast( + &cuda_pool_message_ptr->cuda_handle), + reinterpret_cast(shm_pool_->GetCUDAMemoryPoolManager() + ->CUDAPoolAddress(device_id)))); + } + + ipc_message->Command() = PYTHONSTUB_CUDAPoolInitializeRequest; + ipc_message->Args() = cuda_pool_message.handle_; + + cuda_pool_message_ptr->device_id = device_id; + cuda_pool_message_ptr->has_error = false; + cuda_pool_message_ptr->is_error_set = false; + cuda_pool_message_ptr->waiting_on_stub = false; + + { + bi::scoped_lock lock{ + *(ipc_message->ResponseMutex())}; + parent_to_stub_mq_->Push(ipc_message->ShmHandle()); + while (!cuda_pool_message_ptr->waiting_on_stub) { + ipc_message->ResponseCondition()->wait(lock); + } + } + + if (cuda_pool_message_ptr->has_error) { + if (cuda_pool_message_ptr->is_error_set) { + std::unique_ptr error_message = + PbString::LoadFromSharedMemory( + shm_pool_, cuda_pool_message_ptr->error); + throw PythonBackendException(error_message->String()); + } else { + throw PythonBackendException( + "Failed to share CUDA memory pool with stub process: " + + model_name_); + } + } + } + catch (const PythonBackendException& exception) { + shm_pool_->GetCUDAMemoryPoolManager()->SetCUDAPoolAddress( + device_id, nullptr); + pb_exception = exception; + } + + { + bi::scoped_lock lock{ + *(ipc_message->ResponseMutex())}; + cuda_pool_message_ptr->waiting_on_stub = false; + ipc_message->ResponseCondition()->notify_all(); + } + + tried_sharing_cuda_pool_map_[device_id] = true; + + if (pb_exception.what() != std::string{""}) { + throw pb_exception; + } +} +#endif // TRITON_ENABLE_GPU +}}}; // namespace triton::backend::python diff --git a/src/stub_launcher.h b/src/stub_launcher.h new file mode 100644 index 00000000..58cdcc61 --- /dev/null +++ b/src/stub_launcher.h @@ -0,0 +1,222 @@ +// Copyright 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions +// are met: +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// * Neither the name of NVIDIA CORPORATION nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "ipc_message.h" +#include "memory_manager.h" +#include "message_queue.h" +#include "pb_utils.h" +#include "triton/backend/backend_common.h" +#include "triton/backend/backend_model.h" +#include "triton/backend/backend_model_instance.h" +#include "triton/core/tritonbackend.h" +#include "triton/core/tritonserver.h" + +namespace triton { namespace backend { namespace python { + +class ModelState; + +class StubLauncher { + public: + StubLauncher(const std::string stub_process_kind); + StubLauncher( + const std::string stub_process_kind, + const std::string model_instance_name, const int32_t device_id, + const std::string kind); + + // Initialize stub process + TRITONSERVER_Error* Initialize(ModelState* model_state); + + // Stub process setup + TRITONSERVER_Error* Setup(); + + // Launch stub process + TRITONSERVER_Error* Launch(); + + // Auto-complete stub process + void AutocompleteStubProcess(); + + // Model instance stub process + TRITONSERVER_Error* ModelInstanceStubProcess(); + + // Check if Stub PID is active + bool StubActive(); + + // Health mutex + bi::interprocess_mutex* HealthMutex() { return health_mutex_; } + + // Stub message queue + std::unique_ptr>& + StubMessageQueue() + { + return stub_message_queue_; + } + + // Parent message queue + std::unique_ptr>& + ParentMessageQueue() + { + return parent_message_queue_; + } + + // Stub to parent message queue + std::unique_ptr>& + StubToParentMessageQueue() + { + return stub_to_parent_mq_; + } + + // Parent to stub message queue + std::unique_ptr>& + ParentToStubMessageQueue() + { + return parent_to_stub_mq_; + } + + // Memory Manager + std::unique_ptr& GetMemoryManager() { return memory_manager_; } + + // IPC control + std::unique_ptr>& + IpcControl() + { + return ipc_control_; + } + + // Shared memory pool + std::unique_ptr& ShmPool() { return shm_pool_; } + + // Get auto-complete model configuration + common::TritonJson::Value& AutoCompleteConfig() + { + return auto_complete_config_; + } + + // Update health variable + void UpdateHealth(); + + // Is Healthy + bool IsHealthy() { return is_healthy_; } + + // Destruct Stub process + void TerminateStub(); + + // Reset log queue and bls decoupled queue pointers + void ClearQueues(); + + // Kill stub process + void KillStubProcess(); + + // Get a message from the stub process + TRITONSERVER_Error* ReceiveMessageFromStub( + bi::managed_external_buffer::handle_t& message, + uint64_t timeout_miliseconds = 1000); + + // Wait for stub process + void WaitForStubProcess(); + +#ifndef _WIN32 + // FIXME [DLIS-5969]: Enable for Windows when custom execution environments + // are supported. 
+ TRITONSERVER_Error* GetPythonEnvironment(ModelState* model_state); +#endif +#ifdef TRITON_ENABLE_GPU + // Share CUDA memory pool with stub process + void ShareCUDAMemoryPool( + TRITONBACKEND_MemoryManager* triton_mem_manager, const int32_t device_id); +#endif // TRITON_ENABLE_GPU + + private: +#ifdef _WIN32 + STARTUPINFO startup_info_; + DWORD parent_pid_; + PROCESS_INFORMATION stub_pid_; +#else + pid_t parent_pid_; + pid_t stub_pid_; +#endif + bool is_initialized_; + bool is_decoupled_; + bool is_healthy_; + std::string shm_region_name_; + std::string model_repository_path_; + std::string model_path_; + std::string runtime_modeldir_; + const std::string stub_process_kind_; + std::string model_name_; + const std::string model_instance_name_; + const int32_t device_id_; + const std::string kind_; + uint64_t model_version_; + + std::string python_lib_; + int64_t shm_default_byte_size_; + int64_t shm_growth_byte_size_; + int64_t shm_message_queue_size_; + + // Path to python execution environment + std::string path_to_libpython_; + std::string path_to_activate_; + std::string python_execution_env_; + + common::TritonJson::WriteBuffer model_config_buffer_; + common::TritonJson::Value auto_complete_config_; + + bi::interprocess_mutex* health_mutex_; + std::unique_ptr> + stub_message_queue_; + std::unique_ptr> + parent_message_queue_; + std::unique_ptr> + stub_to_parent_mq_; + std::unique_ptr> + parent_to_stub_mq_; + std::unique_ptr memory_manager_; + std::unique_ptr> + ipc_control_; + bi::managed_external_buffer::handle_t ipc_control_handle_; + std::unique_ptr shm_pool_; +#ifdef TRITON_ENABLE_GPU + std::mutex cuda_shm_pool_mutex_; + std::unordered_map tried_sharing_cuda_pool_map_; +#endif // TRITON_ENABLE_GPU +}; +}}} // namespace triton::backend::python
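To close, a simplified sketch of how the StubLauncher API declared above is typically driven. The real driver is the backend code behind python_be.h and also handles decoupled mode, health monitoring, and restarts; the free function and its name here are hypothetical.

// Illustrative sketch only (not part of the patch). Uses only the public
// StubLauncher API declared in stub_launcher.h above; RETURN_IF_ERROR comes
// from the Triton backend headers already included there.
#include <memory>
#include <string>

#include "stub_launcher.h"

namespace triton { namespace backend { namespace python {

TRITONSERVER_Error*
LaunchModelInstanceStub(
    ModelState* model_state, const std::string& instance_name,
    const int32_t device_id, const std::string& kind,
    std::unique_ptr<StubLauncher>& stub)
{
  // One launcher per model instance stub process.
  stub = std::make_unique<StubLauncher>(
      "MODEL_INSTANCE_STUB", instance_name, device_id, kind);

  RETURN_IF_ERROR(stub->Initialize(model_state));  // read model config/state
  RETURN_IF_ERROR(stub->Setup());                  // shm pool + message queues
  RETURN_IF_ERROR(stub->Launch());                 // fork/exec + handshake
  return nullptr;  // success
}

}}}  // namespace triton::backend::python

On unload, TerminateStub() then sends the finalize request to a healthy stub (or force-kills an unhealthy one) and resets the queues before the shared memory pool is destructed.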