diff --git a/.clang-format b/.clang-format index 98c6497..1defc17 100644 --- a/.clang-format +++ b/.clang-format @@ -2,6 +2,7 @@ BasedOnStyle: Google IndentWidth: 2 +ColumnLimit: 80 ContinuationIndentWidth: 4 UseTab: Never MaxEmptyLinesToKeep: 2 @@ -34,4 +35,5 @@ BinPackArguments: true BinPackParameters: true ConstructorInitializerAllOnOneLineOrOnePerLine: false -IndentCaseLabels: true \ No newline at end of file +IndentCaseLabels: true + diff --git a/.github/workflows/pre-commit.yml b/.github/workflows/pre-commit.yml new file mode 100644 index 0000000..4fa1873 --- /dev/null +++ b/.github/workflows/pre-commit.yml @@ -0,0 +1,38 @@ +# Copyright 2023-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +name: pre-commit + +on: + pull_request: + +jobs: + pre-commit: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v5.0.0 + - uses: actions/setup-python@v6.0.0 + - uses: pre-commit/action@v3.0.1 diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml new file mode 100644 index 0000000..3c76a6e --- /dev/null +++ b/.pre-commit-config.yaml @@ -0,0 +1,73 @@ +# Copyright 2023-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. 
+# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +repos: +- repo: https://github.com/PyCQA/isort + rev: 5.12.0 + hooks: + - id: isort + additional_dependencies: [toml] +- repo: https://github.com/psf/black + rev: 23.1.0 + hooks: + - id: black + types_or: [python, cython] +- repo: https://github.com/PyCQA/flake8 + rev: 7.3.0 + hooks: + - id: flake8 + args: [--max-line-length=88, --select=C,E,F,W,B,B950, --extend-ignore = E203,E501] + types_or: [python, cython] +- repo: https://github.com/pre-commit/mirrors-clang-format + rev: v16.0.5 + hooks: + - id: clang-format + types_or: [c, c++, cuda, proto, textproto, java] + args: ["-fallback-style=none", "-style=file", "-i"] +- repo: https://github.com/codespell-project/codespell + rev: v2.2.4 + hooks: + - id: codespell + additional_dependencies: [tomli] + args: ["--toml", "pyproject.toml"] + exclude: (?x)^(.*stemmer.*|.*stop_words.*|^CHANGELOG.md$) +# More details about these pre-commit hooks here: +# https://pre-commit.com/hooks.html +- repo: https://github.com/pre-commit/pre-commit-hooks + rev: v6.0.0 + hooks: + - id: check-case-conflict + - id: check-executables-have-shebangs + - id: check-merge-conflict + - id: check-json + - id: check-toml + - id: check-yaml + - id: check-shebang-scripts-are-executable + - id: end-of-file-fixer + types_or: [c, c++, cuda, proto, textproto, java, python] + - id: mixed-line-ending + - id: requirements-txt-fixer + - id: trailing-whitespace diff --git a/CMakeLists.txt b/CMakeLists.txt index 076b095..5b0e399 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,4 +1,4 @@ -# Copyright 2019-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# Copyright 2019-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions @@ -24,10 +24,13 @@ # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -cmake_minimum_required (VERSION 3.18) +cmake_minimum_required (VERSION 3.31.8) project(tritonpytorchbackend LANGUAGES C CXX) +# Use C++17 standard as Triton's minimum required. +set(TRITON_MIN_CXX_STANDARD 17 CACHE STRING "The minimum C++ standard which features are requested to build this target.") + # # Options # @@ -44,13 +47,16 @@ project(tritonpytorchbackend LANGUAGES C CXX) option(TRITON_ENABLE_GPU "Enable GPU support in backend" ON) option(TRITON_ENABLE_STATS "Include statistics collections in backend" ON) +option(TRITON_ENABLE_NVTX "Include nvtx markers collection in backend." 
OFF) option(TRITON_PYTORCH_ENABLE_TORCHTRT "Enable TorchTRT support" OFF) option(TRITON_PYTORCH_ENABLE_TORCHVISION "Enable Torchvision support" ON) +option(TRITON_PYTORCH_NVSHMEM "Enable NVSHMEM support" ON) set(TRITON_PYTORCH_DOCKER_IMAGE "" CACHE STRING "Docker image containing the PyTorch build required by backend.") set(TRITON_PYTORCH_INCLUDE_PATHS "" CACHE PATH "Paths to Torch includes") set(TRITON_PYTORCH_LIB_PATHS "" CACHE PATH "Paths to Torch libraries") +set(TRITON_REPO_ORGANIZATION "https://github.com/triton-inference-server" CACHE STRING "Git repository to pull from") set(TRITON_BACKEND_REPO_TAG "main" CACHE STRING "Tag for triton-inference-server/backend repo") set(TRITON_CORE_REPO_TAG "main" CACHE STRING "Tag for triton-inference-server/core repo") set(TRITON_COMMON_REPO_TAG "main" CACHE STRING "Tag for triton-inference-server/common repo") @@ -73,37 +79,54 @@ else() endif() # Look for installed Torchvision package in lib paths - if(TRITON_PYTORCH_ENABLE_TORCHVISION AND NOT EXISTS "${TRITON_PYTORCH_LIB_PATHS}/libtorchvision.so") + find_library( LIBTORCHVISION libtorchvision.so libtorchvision.so.1 PATHS ${TRITON_PYTORCH_LIB_PATHS} ) + if(NOT ${LIBTORCHVISION}) message(WARNING "TRITON_PYTORCH_ENABLE_TORCHVISION is on, but TRITON_PYTORCH_LIB_PATHS does not contain Torchvision package") - endif() + endif(NOT ${LIBTORCHVISION}) endif() # Python.h needed by torch headers. -find_package(Python3 REQUIRED COMPONENTS Development) +find_package(Python3 REQUIRED COMPONENTS Development.Module) + +set(RHEL_BUILD OFF) +set(LIB_DIR "lib") +set(LIBTORCH_LIBS_PATH "/usr/local/lib") +set(PY_INSTALL_PATH "/usr/local/lib/python3.12/dist-packages") +if(LINUX) + file(STRINGS "/etc/os-release" DISTRO_ID_LIKE REGEX "ID_LIKE") + if(${DISTRO_ID_LIKE} MATCHES "rhel|centos") + set(RHEL_BUILD ON) + set(LIB_DIR "lib64") + set(PY_INSTALL_PATH "/opt/_internal/cpython-3.12.1/lib/python3.12/site-packages") + if(${CMAKE_SYSTEM_PROCESSOR} MATCHES "x86_64") + set(LIBTORCH_LIBS_PATH "/opt/_internal/cpython-3.12.1/lib") + endif(${CMAKE_SYSTEM_PROCESSOR} MATCHES "x86_64") + endif(${DISTRO_ID_LIKE} MATCHES "rhel|centos") +endif(LINUX) # # Dependencies # -# FetchContent's composibility isn't very good. We must include the +# FetchContent's composability isn't very good. We must include the # transitive closure of all repos so that we can override the tag. 
# include(FetchContent) FetchContent_Declare( repo-common - GIT_REPOSITORY https://github.com/triton-inference-server/common.git + GIT_REPOSITORY ${TRITON_REPO_ORGANIZATION}/common.git GIT_TAG ${TRITON_COMMON_REPO_TAG} GIT_SHALLOW ON ) FetchContent_Declare( repo-core - GIT_REPOSITORY https://github.com/triton-inference-server/core.git + GIT_REPOSITORY ${TRITON_REPO_ORGANIZATION}/core.git GIT_TAG ${TRITON_CORE_REPO_TAG} GIT_SHALLOW ON ) FetchContent_Declare( repo-backend - GIT_REPOSITORY https://github.com/triton-inference-server/backend.git + GIT_REPOSITORY ${TRITON_REPO_ORGANIZATION}/backend.git GIT_TAG ${TRITON_BACKEND_REPO_TAG} GIT_SHALLOW ON ) @@ -120,66 +143,93 @@ else() endif() endif() # TRITON_ENABLE_GPU +if(${TRITON_ENABLE_NVTX}) + add_definitions(-DTRITON_ENABLE_NVTX=1) +endif() # TRITON_ENABLE_NVTX + # # Shared library implementing the Triton Backend API # configure_file(src/libtriton_pytorch.ldscript libtriton_pytorch.ldscript COPYONLY) -if (${TRITON_PYTORCH_DOCKER_BUILD}) - if (CMAKE_HOST_SYSTEM_PROCESSOR MATCHES "aarch64") - set(LIBS_ARCH "aarch64") - set(CONDA_LIBS - "libopenblas.so.0" - ) - else() - set(LIBS_ARCH "x86_64") - set(CONDA_LIBS - "libmkl_core.so" - "libmkl_gnu_thread.so" - "libmkl_intel_lp64.so" - "libmkl_intel_thread.so" - "libmkl_def.so" - "libmkl_vml_def.so" - "libmkl_rt.so" - "libmkl_avx2.so" - "libmkl_avx512.so" - "libmkl_sequential.so" - "libomp.so" - ) - endif() +set(PT_LIBS + "libc10.so" + "libc10_cuda.so" + "libtorch.so" + "libtorch_cpu.so" + "libtorch_cuda.so" + "libtorch_cuda_linalg.so" + "libtorch_global_deps.so" + "libjpeg.so.62" +) + +if (${TRITON_PYTORCH_NVSHMEM}) set(PT_LIBS - ${CONDA_LIBS} - "libc10.so" - "libc10_cuda.so" - "libtorch.so" - "libtorch_cpu.so" - "libtorch_cuda.so" - "libtorch_global_deps.so" - "libtorchvision.so" + ${PT_LIBS} + "libtorch_nvshmem.so" ) - set(OPENCV_LIBS - "libopencv_video.so" - "libopencv_videoio.so" - "libopencv_highgui.so" - "libopencv_imgcodecs.so" - "libopencv_imgproc.so" - "libopencv_core.so" - "libpng16.so" +endif() # TRITON_PYTORCH_NVSHMEM + +if (${TRITON_PYTORCH_ENABLE_TORCHVISION}) + set(PT_LIBS + ${PT_LIBS} + $,libtorchvision.so,libtorchvision.so.1> ) +endif() # TRITON_PYTORCH_ENABLE_TORCHVISION - string(REPLACE ";" " " CONDA_LIBS_STR "${CONDA_LIBS}") +if (${TRITON_PYTORCH_ENABLE_TORCHTRT}) + set(PT_LIBS + ${PT_LIBS} + "libtorchtrt_runtime.so" + ) +endif() # TRITON_PYTORCH_ENABLE_TORCHTRT + +if (CMAKE_HOST_SYSTEM_PROCESSOR MATCHES "aarch64") + set(LIBS_ARCH "aarch64") + set(LIBTORCH_LIBS + "libnvpl_blas_core.so.0" + "libnvpl_blas_ilp64_gomp.so.0" + "libnvpl_blas_ilp64_seq.so.0" + "libnvpl_blas_lp64_gomp.so.0" + "libnvpl_blas_lp64_seq.so.0" + "libnvpl_lapack_core.so.0" + "libnvpl_lapack_ilp64_gomp.so.0" + "libnvpl_lapack_ilp64_seq.so.0" + "libnvpl_lapack_lp64_gomp.so.0" + "libnvpl_lapack_lp64_seq.so.0" + ) +else() + set(LIBS_ARCH "x86_64") + set(LIBTORCH_LIBS + "libmkl_avx2.so.1" + "libmkl_avx512.so.1" + "libmkl_core.so.1" + "libmkl_def.so.1" + "libmkl_gnu_thread.so.1" + "libmkl_intel_lp64.so.1" + "libmkl_intel_thread.so.1" + "libmkl_rt.so.1" + "libmkl_sequential.so.1" + "libmkl_vml_def.so.1" + ) +endif() +set(TORCHVISION_LIBS + $,libjpeg.so.62,libjpeg.so> + $,libpng16.so.16,libpng16.so> +) - if (${TRITON_PYTORCH_ENABLE_TORCHTRT}) - set(PT_LIBS - ${PT_LIBS} - "libtorchtrt_runtime.so" - ) - endif() # TRITON_PYTORCH_ENABLE_TORCHTRT +# The patchelf commands ensure the MKL libraries are loaded correctly during runtime +# Without these, the framework/backend complains of missing libraries / symbols and +# in some 
cases leads to segmentation faults. +if (${TRITON_PYTORCH_DOCKER_BUILD}) + string(REPLACE ";" " " LIBTORCH_LIBS_STR "${LIBTORCH_LIBS}") + string(RANDOM 8 "abcdefghijklmnopqrstuvwxyz" random_id) add_custom_command( OUTPUT ${PT_LIBS} - ${OPENCV_LIBS} + ${LIBTORCH_LIBS} + ${TORCHVISION_LIBS} LICENSE.pytorch include/torch include/torchvision @@ -187,38 +237,42 @@ if (${TRITON_PYTORCH_DOCKER_BUILD}) COMMAND docker pull ${TRITON_PYTORCH_DOCKER_IMAGE} COMMAND docker rm pytorch_backend_ptlib || echo "error ignored..." || true COMMAND docker create --name pytorch_backend_ptlib ${TRITON_PYTORCH_DOCKER_IMAGE} - COMMAND /bin/sh -c "for i in ${CONDA_LIBS_STR} ; do echo copying $i && docker cp -L pytorch_backend_ptlib:/opt/conda/lib/$i $i ; done" - COMMAND docker cp pytorch_backend_ptlib:/opt/conda/lib/python3.8/site-packages/torch/lib/libc10.so libc10.so - COMMAND docker cp pytorch_backend_ptlib:/opt/conda/lib/python3.8/site-packages/torch/lib/libc10_cuda.so libc10_cuda.so - COMMAND docker cp pytorch_backend_ptlib:/opt/conda/lib/python3.8/site-packages/torch/lib/libtorch.so libtorch.so - COMMAND docker cp pytorch_backend_ptlib:/opt/conda/lib/python3.8/site-packages/torch/lib/libtorch_cpu.so libtorch_cpu.so - COMMAND docker cp pytorch_backend_ptlib:/opt/conda/lib/python3.8/site-packages/torch/lib/libtorch_cuda.so libtorch_cuda.so - COMMAND docker cp pytorch_backend_ptlib:/opt/conda/lib/python3.8/site-packages/torch/lib/libtorch_global_deps.so libtorch_global_deps.so - COMMAND docker cp pytorch_backend_ptlib:/opt/pytorch/vision/build/libtorchvision.so libtorchvision.so - COMMAND /bin/sh -c "if [ ${TRITON_PYTORCH_ENABLE_TORCHTRT} = 'ON' ]; then docker cp pytorch_backend_ptlib:/opt/conda/lib/python3.8/site-packages/torch_tensorrt/lib/libtorchtrt_runtime.so libtorchtrt_runtime.so; fi" - COMMAND docker cp pytorch_backend_ptlib:/opt/conda/lib/python3.8/site-packages/torch_tensorrt/bin/torchtrtc torchtrtc || echo "error ignored..." 
|| true + COMMAND /bin/sh -c "for i in ${LIBTORCH_LIBS_STR} ; do echo copying $i && docker cp -L pytorch_backend_ptlib:${LIBTORCH_LIBS_PATH}/$i $i ; done" + COMMAND docker cp pytorch_backend_ptlib:${PY_INSTALL_PATH}/torch/lib/libc10.so libc10.so + COMMAND docker cp pytorch_backend_ptlib:${PY_INSTALL_PATH}/torch/lib/libc10_cuda.so libc10_cuda.so + COMMAND docker cp pytorch_backend_ptlib:${PY_INSTALL_PATH}/torch/lib/libtorch.so libtorch.so + COMMAND docker cp pytorch_backend_ptlib:${PY_INSTALL_PATH}/torch/lib/libtorch_cpu.so libtorch_cpu.so + COMMAND docker cp pytorch_backend_ptlib:${PY_INSTALL_PATH}/torch/lib/libtorch_cuda.so libtorch_cuda.so + COMMAND docker cp pytorch_backend_ptlib:${PY_INSTALL_PATH}/torch/lib/libtorch_cuda_linalg.so libtorch_cuda_linalg.so + COMMAND docker cp pytorch_backend_ptlib:${PY_INSTALL_PATH}/torch/lib/libtorch_global_deps.so libtorch_global_deps.so + COMMAND docker cp pytorch_backend_ptlib:${PY_INSTALL_PATH}/torch/lib/libcaffe2_nvrtc.so libcaffe2_nvrtc.so + COMMAND /bin/sh -c "if [ ${TRITON_PYTORCH_NVSHMEM} = 'ON' ]; then docker cp pytorch_backend_ptlib:${PY_INSTALL_PATH}/torch/lib/libtorch_nvshmem.so libtorch_nvshmem.so; fi" + COMMAND /bin/sh -c "if [ ${TRITON_PYTORCH_ENABLE_TORCHVISION} = 'ON' ]; then if [ ${RHEL_BUILD} = 'ON' ]; then docker cp -a -L pytorch_backend_ptlib:/usr/local/lib64/libtorchvision.so libtorchvision.so; else docker cp -a -L pytorch_backend_ptlib:/usr/local/${LIB_DIR}/libtorchvision.so.1 libtorchvision.so.1; fi; fi" + COMMAND /bin/sh -c "if [ ${TRITON_PYTORCH_ENABLE_TORCHVISION} = 'ON' ]; then docker cp pytorch_backend_ptlib:/opt/pytorch/vision/torchvision/csrc include/torchvision/torchvision; fi" + COMMAND /bin/sh -c "if [ ${TRITON_PYTORCH_ENABLE_TORCHTRT} = 'ON' ]; then docker cp pytorch_backend_ptlib:/usr/local/lib/python3.12/dist-packages/torch_tensorrt/lib/libtorchtrt_runtime.so libtorchtrt_runtime.so; fi" + COMMAND docker cp pytorch_backend_ptlib:${PY_INSTALL_PATH}/torch_tensorrt/bin/torchtrtc torchtrtc || echo "error ignored..." 
|| true COMMAND docker cp pytorch_backend_ptlib:/opt/pytorch/pytorch/LICENSE LICENSE.pytorch - COMMAND docker cp pytorch_backend_ptlib:/opt/conda/lib/python3.8/site-packages/torch/include include/torch - COMMAND docker cp pytorch_backend_ptlib:/opt/pytorch/pytorch/torch/csrc/jit/codegen include/torch/torch/csrc/jit/codegen - COMMAND docker cp pytorch_backend_ptlib:/opt/pytorch/vision/torchvision/csrc include/torchvision/torchvision - COMMAND docker cp pytorch_backend_ptlib:/usr/lib/${LIBS_ARCH}-linux-gnu/libopencv_videoio.so.3.4.11 libopencv_videoio.so - COMMAND docker cp pytorch_backend_ptlib:/usr/lib/${LIBS_ARCH}-linux-gnu/libopencv_highgui.so.3.4.11 libopencv_highgui.so - COMMAND docker cp pytorch_backend_ptlib:/usr/lib/${LIBS_ARCH}-linux-gnu/libopencv_video.so.3.4.11 libopencv_video.so - COMMAND docker cp pytorch_backend_ptlib:/usr/lib/${LIBS_ARCH}-linux-gnu/libopencv_imgcodecs.so.3.4.11 libopencv_imgcodecs.so - COMMAND docker cp pytorch_backend_ptlib:/usr/lib/${LIBS_ARCH}-linux-gnu/libopencv_imgproc.so.3.4.11 libopencv_imgproc.so - COMMAND docker cp pytorch_backend_ptlib:/usr/lib/${LIBS_ARCH}-linux-gnu/libopencv_core.so.3.4.11 libopencv_core.so - COMMAND docker cp pytorch_backend_ptlib:/usr/lib/${LIBS_ARCH}-linux-gnu/libpng16.so.16.37.0 libpng16.so - COMMAND /bin/sh -c "if [ -f libmkl_def.so ]; then patchelf --add-needed libmkl_gnu_thread.so libmkl_def.so; fi" - COMMAND /bin/sh -c "if [ -f libmkl_def.so ]; then patchelf --add-needed libmkl_core.so libmkl_def.so; fi" - COMMAND /bin/sh -c "if [ -f libmkl_avx2.so ]; then patchelf --add-needed libmkl_gnu_thread.so libmkl_avx2.so; fi" - COMMAND /bin/sh -c "if [ -f libmkl_avx2.so ]; then patchelf --add-needed libmkl_core.so libmkl_avx2.so; fi" - COMMAND /bin/sh -c "if [ -f libmkl_avx512.so ]; then patchelf --add-needed libmkl_gnu_thread.so libmkl_avx512.so; fi" - COMMAND /bin/sh -c "if [ -f libmkl_avx512.so ]; then patchelf --add-needed libmkl_core.so libmkl_avx512.so; fi" + COMMAND docker cp pytorch_backend_ptlib:${PY_INSTALL_PATH}/torch/include include/torch + COMMAND docker cp pytorch_backend_ptlib:/opt/pytorch/pytorch/torch/csrc/jit/codegen include/torch/torch/csrc/jit/. 
+ + COMMAND /bin/sh -c "if [ ${RHEL_BUILD} = 'ON' ]; then docker cp -L pytorch_backend_ptlib:/usr/lib64/libjpeg.so.62 libjpeg.so.62; else docker cp -L pytorch_backend_ptlib:/usr/local/lib/libjpeg.so.62 libjpeg.so.62 && docker cp pytorch_backend_ptlib:/usr/lib/${LIBS_ARCH}-linux-gnu/libjpeg.so.8.2.2 libjpeg.so; fi;" + COMMAND /bin/sh -c "if [ ${RHEL_BUILD} = 'ON' ]; then docker cp -L pytorch_backend_ptlib:/usr/lib64/libpng16.so.16 libpng16.so.16; else docker cp -L pytorch_backend_ptlib:/usr/lib/${LIBS_ARCH}-linux-gnu/libpng16.so libpng16.so; fi;" + COMMAND /bin/sh -c "if [ -f libmkl_def.so.1 ]; then patchelf --add-needed libmkl_gnu_thread.so.1 libmkl_def.so.1; fi" + COMMAND /bin/sh -c "if [ -f libmkl_def.so.1 ]; then patchelf --add-needed libmkl_core.so.1 libmkl_def.so.1; fi" + COMMAND /bin/sh -c "if [ -f libmkl_avx2.so.1 ]; then patchelf --add-needed libmkl_gnu_thread.so.1 libmkl_avx2.so.1; fi" + COMMAND /bin/sh -c "if [ -f libmkl_avx2.so.1 ]; then patchelf --add-needed libmkl_core.so.1 libmkl_avx2.so.1; fi" + COMMAND /bin/sh -c "if [ -f libmkl_avx512.so.1 ]; then patchelf --add-needed libmkl_gnu_thread.so.1 libmkl_avx512.so.1; fi" + COMMAND /bin/sh -c "if [ -f libmkl_avx512.so.1 ]; then patchelf --add-needed libmkl_core.so.1 libmkl_avx512.so.1; fi" + COMMAND /bin/sh -c "if [ -f libmkl_vml_def.so.1 ]; then patchelf --add-needed libmkl_gnu_thread.so.1 libmkl_vml_def.so.1; fi" + COMMAND /bin/sh -c "if [ -f libmkl_vml_def.so.1 ]; then patchelf --add-needed libmkl_intel_thread.so.1 libmkl_vml_def.so.1; fi" + COMMAND /bin/sh -c "if [ -f libmkl_vml_def.so.1 ]; then patchelf --add-needed libmkl_core.so.1 libmkl_vml_def.so.1; fi" + COMMAND /bin/sh -c "if [ -f libmkl_intel_thread.so.1 ]; then patchelf --add-needed libmkl_intel_lp64.so.1 libmkl_intel_thread.so.1; fi" + COMMAND /bin/sh -c "if [ ${TRITON_PYTORCH_ENABLE_TORCHVISION} = 'ON' ]; then if [ ${RHEL_BUILD} = 'OFF' ]; then ln -s libtorchvision.so.1 libtorchvision.so; fi; fi;" COMMAND docker rm pytorch_backend_ptlib COMMENT "Extracting pytorch and torchvision libraries and includes from ${TRITON_PYTORCH_DOCKER_IMAGE}" VERBATIM ) - add_custom_target(ptlib_target DEPENDS ${PT_LIBS} ${OPENCV_LIBS}) + add_custom_target(ptlib_target DEPENDS ${PT_LIBS} ${LIBTORCH_LIBS} ${TORCHVISION_LIBS}) add_library(ptlib SHARED IMPORTED GLOBAL) add_dependencies(ptlib ptlib_target) @@ -235,6 +289,9 @@ add_library( src/libtorch.cc src/libtorch_utils.cc src/libtorch_utils.h + src/model_instance_state.cc + src/model_state.cc + src/string_utils.cc ) add_library( @@ -266,7 +323,7 @@ endif() # TRITON_PYTORCH_DOCKER_BUILD # Need to turn off -Werror due to Torchvision vision.h extern initialization # Unfortunately gcc does not provide a specific flag to ignore the specific # warning: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=45977 -target_compile_features(triton-pytorch-backend PRIVATE cxx_std_11) +target_compile_features(triton-pytorch-backend PRIVATE cxx_std_${TRITON_MIN_CXX_STANDARD}) target_compile_options( triton-pytorch-backend PRIVATE $<$,$,$>: @@ -317,8 +374,8 @@ if (${TRITON_PYTORCH_DOCKER_BUILD}) if (${TRITON_PYTORCH_ENABLE_TORCHVISION}) set(TRITON_PYTORCH_LIBS - ${TRITON_PYTORCH_LIBS} - "${CMAKE_CURRENT_BINARY_DIR}/libtorchvision.so") + ${TRITON_PYTORCH_LIBS} + "${CMAKE_CURRENT_BINARY_DIR}/$,libtorchvision.so,libtorchvision.so.1>") endif() # TRITON_PYTORCH_ENABLE_TORCHVISION if (${TRITON_PYTORCH_ENABLE_TORCHTRT}) @@ -384,7 +441,7 @@ install( if (${TRITON_PYTORCH_DOCKER_BUILD}) set(PT_LIB_PATHS "") - FOREACH(plib ${PT_LIBS} ${OPENCV_LIBS}) + FOREACH(plib 
${PT_LIBS} ${LIBTORCH_LIBS} ${TORCHVISION_LIBS}) set(PT_LIB_PATHS ${PT_LIB_PATHS} "${CMAKE_CURRENT_BINARY_DIR}/${plib}") ENDFOREACH(plib) @@ -403,7 +460,7 @@ if (${TRITON_PYTORCH_DOCKER_BUILD}) ) endif() # TRITON_PYTORCH_ENABLE_TORCHTRT - FOREACH(plib ${PT_LIBS} ${OPENCV_LIBS}) + FOREACH(plib ${PT_LIBS} ${LIBTORCH_LIBS} ${TORCHVISION_LIBS}) install( CODE "EXECUTE_PROCESS( @@ -416,23 +473,40 @@ if (${TRITON_PYTORCH_DOCKER_BUILD}) ) ENDFOREACH(plib) - set(OPENCV_VERSION "3.4") install( CODE "EXECUTE_PROCESS( - COMMAND ln -sf libopencv_video.so libopencv_video.so.${OPENCV_VERSION} - COMMAND ln -sf libopencv_videoio.so libopencv_videoio.so.${OPENCV_VERSION} - COMMAND ln -sf libopencv_highgui.so libopencv_highgui.so.${OPENCV_VERSION} - COMMAND ln -sf libopencv_imgcodecs.so libopencv_imgcodecs.so.${OPENCV_VERSION} - COMMAND ln -sf libopencv_imgproc.so libopencv_imgproc.so.${OPENCV_VERSION} - COMMAND ln -sf libopencv_core.so libopencv_core.so.${OPENCV_VERSION} COMMAND ln -sf libpng16.so libpng16.so.16 + COMMAND ln -sf libjpeg.so libjpeg.so.8 RESULT_VARIABLE LINK_STATUS WORKING_DIRECTORY ${CMAKE_INSTALL_PREFIX}/backends/pytorch) if(LINK_STATUS AND NOT LINK_STATUS EQUAL 0) message(FATAL_ERROR \"FAILED: to create links\") endif()" ) +else() + FOREACH(plib ${PT_LIBS}) + set(PT_LIB_PATHS ${PT_LIB_PATHS} "${TRITON_PYTORCH_LIB_PATHS}/${plib}") + ENDFOREACH(plib) + + install( + FILES + ${PT_LIB_PATHS} + DESTINATION ${CMAKE_INSTALL_PREFIX}/backends/pytorch + ) + + FOREACH(plib ${PT_LIBS}) + install( + CODE + "EXECUTE_PROCESS( + COMMAND patchelf --set-rpath \$ORIGIN ${plib} + RESULT_VARIABLE PATCHELF_STATUS + WORKING_DIRECTORY ${CMAKE_INSTALL_PREFIX}/backends/pytorch) + if(PATCHELF_STATUS AND NOT PATCHELF_STATUS EQUAL 0) + message(FATAL_ERROR \"FAILED: to run patchelf\") + endif()" + ) + ENDFOREACH(plib) endif() # TRITON_PYTORCH_DOCKER_BUILD install( @@ -446,6 +520,13 @@ install( ${INSTALL_CONFIGDIR} ) +install( + FILES + src/model.py + DESTINATION + ${CMAKE_INSTALL_PREFIX}/backends/pytorch +) + include(CMakePackageConfigHelpers) configure_package_config_file( ${CMAKE_CURRENT_LIST_DIR}/cmake/TritonPyTorchBackendConfig.cmake.in diff --git a/README.md b/README.md index 0eb8388..ccc803c 100644 --- a/README.md +++ b/README.md @@ -1,5 +1,5 @@ +# PyTorch (LibTorch) Backend + [![License](https://img.shields.io/badge/License-BSD3-lightgrey.svg)](https://opensource.org/licenses/BSD-3-Clause) -# PyTorch (LibTorch) Backend +The Triton backend for +[PyTorch](https://github.com/pytorch/pytorch) +is designed to run +[TorchScript](https://pytorch.org/docs/stable/jit.html) +models using the PyTorch C++ API. +All models created in PyTorch using the python API must be traced/scripted to produce a TorchScript model. -The Triton backend for [PyTorch](https://github.com/pytorch/pytorch). -You can learn more about Triton backends in the [backend -repo](https://github.com/triton-inference-server/backend). Ask -questions or report problems on the [issues -page](https://github.com/triton-inference-server/server/issues). -This backend is designed to run [TorchScript](https://pytorch.org/docs/stable/jit.html) -models using the PyTorch C++ API. All models created in PyTorch -using the python API must be traced/scripted to produce a TorchScript -model. - -Where can I ask general questions about Triton and Triton backends? 
-Be sure to read all the information below as well as the [general -Triton documentation](https://github.com/triton-inference-server/server#triton-inference-server) -available in the main [server](https://github.com/triton-inference-server/server) -repo. If you don't find your answer there you can ask questions on the -main Triton [issues page](https://github.com/triton-inference-server/server/issues). +You can learn more about Triton backends in the +[Triton Backend](https://github.com/triton-inference-server/backend) +repository. + +Ask questions or report problems using +[Triton Server issues](https://github.com/triton-inference-server/server/issues). + +Be sure to read all the information below as well as the +[general Triton documentation](https://github.com/triton-inference-server/server#triton-inference-server) +available in the [Triton Server](https://github.com/triton-inference-server/server) repository. ## Build the PyTorch Backend -Use a recent cmake to build. First install the required dependencies. +Use a recent cmake to build. +First install the required dependencies. -``` -$ apt-get install patchelf rapidjson-dev python3-dev +```bash +apt-get install rapidjson-dev python3-dev python3-pip +pip3 install patchelf==0.17.2 ``` -An appropriate PyTorch container from [NGC](https://ngc.nvidia.com) must be used. -For example, to build a backend that uses the 21.02 version of the PyTorch -container from NGC: +An appropriate PyTorch container from [NVIDIA NGC Catalog](https://ngc.nvidia.com) must be used. +For example, to build a backend that uses the 23.04 version of the PyTorch container from NGC: -``` -$ mkdir build -$ cd build -$ cmake -DCMAKE_INSTALL_PREFIX:PATH=`pwd`/install -DTRITON_PYTORCH_DOCKER_IMAGE="nvcr.io/nvidia/pytorch:21.02-py3" .. -$ make install +```bash +mkdir build +cd build +cmake -DCMAKE_INSTALL_PREFIX:PATH=`pwd`/install -DTRITON_PYTORCH_DOCKER_IMAGE="nvcr.io/nvidia/pytorch:23.04-py3" .. +make install ``` -The following required Triton repositories will be pulled and used in -the build. By default the "main" branch/tag will be used for each repo -but the listed CMake argument can be used to override. +The following required Triton repositories will be pulled and used in the build. +By default, the `main` head will be used for each repository but the listed CMake argument can be used to override the value. -* triton-inference-server/backend: -DTRITON_BACKEND_REPO_TAG=[tag] -* triton-inference-server/core: -DTRITON_CORE_REPO_TAG=[tag] -* triton-inference-server/common: -DTRITON_COMMON_REPO_TAG=[tag] +* triton-inference-server/backend: `-DTRITON_BACKEND_REPO_TAG=[tag]` +* triton-inference-server/core: `-DTRITON_CORE_REPO_TAG=[tag]` +* triton-inference-server/common: `-DTRITON_COMMON_REPO_TAG=[tag]` ## Build the PyTorch Backend With Custom PyTorch -Currently, Triton requires that a specially patched version of -PyTorch be used with the PyTorch backend. The full source for -these PyTorch versions are available as Docker images from -[NGC](https://ngc.nvidia.com). For example, the PyTorch version -compatible with the 21.02 release of Triton is available as -nvcr.io/nvidia/pytorch:21.02-py3. +Currently, Triton requires that a specially patched version of PyTorch be used with the PyTorch backend. +The full source for these PyTorch versions are available as Docker images from +[NGC](https://ngc.nvidia.com). 
+ + For example, the PyTorch version compatible with the 25.09 release of Triton is available as `nvcr.io/nvidia/pytorch:25.09-py3`, which supports PyTorch version `2.9.0a0`. + +> [!NOTE] +> Additional details and version information can be found in the container's +> [release notes](https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/rel-25-09.html#rel-25-09). -Copy over the LibTorch and Torchvision headers and libraries from the +Copy over the LibTorch and TorchVision headers and libraries from the [PyTorch NGC container](https://ngc.nvidia.com/catalog/containers/nvidia:pytorch) -into local directories. You can see which headers and libraries -are needed/copied from the docker. +into local directories. +You can see which headers and libraries are needed/copied from the docker. + +```bash +mkdir build +cd build +cmake -DCMAKE_INSTALL_PREFIX:PATH=`pwd`/install -DTRITON_PYTORCH_INCLUDE_PATHS="/torch;/torch/torch/csrc/api/include;/torchvision" -DTRITON_PYTORCH_LIB_PATHS="" .. +make install +``` + +## Using the PyTorch Backend +### PyTorch 2.0 Models + +PyTorch 2.0 features are available. +However, Triton's PyTorch backend requires a serialized representation of the model in the form of a `model.pt` file. +The `model.pt` file can be generated using PyTorch's +[`torch.save()`](https://docs.pytorch.org/tutorials/beginner/saving_loading_models.html#id1) +function. + +The model repository should look like: + +```bash +model_repository/ +`-- model_directory + |-- 1 + | `-- model.pt + `-- config.pbtxt ``` -$ mkdir build -$ cd build -$ cmake -DCMAKE_INSTALL_PREFIX:PATH=`pwd`/install -DTRITON_PYTORCH_INCLUDE_PATHS="/torch;/torch/torch/csrc/api/include;/torchvision" -DTRITON_PYTORCH_LIB_PATHS="" .. -$ make install + +Where `model.pt` is the serialized representation of the model. + +### TorchScript Models + +The model repository should look like: + +```bash +model_repository/ +`-- model_directory + |-- 1 + | `-- model.pt + `-- config.pbtxt ``` -## Using the PyTorch Backend +The `model.pt` is the TorchScript model file. + +## Configuration + +Triton exposes some flags to control the execution mode of the TorchScript models through the `Parameters` section of the model's `config.pbtxt` file. + +### Configuration Options + +* `default_model_name`: + Instructs the Triton PyTorch backend to load the model from a file of the given name. + + The model config specifying the option would look like: + + ```proto + default_model_name: "another_file_name.pt" + ``` ### Parameters -Triton exposes some flags to control the execution mode of the TorchScript models through -the Parameters section of the model's 'config.pbtxt' file. +* `DISABLE_OPTIMIZED_EXECUTION`: + Boolean flag to disable the optimized execution of TorchScript models. + By default, the optimized execution is always enabled. -* `DISABLE_OPTIMIZED_EXECUTION`: Boolean flag to disable the optimized execution -of TorchScript models. By default the optimized execuiton is always enabled. + The initial calls to a loaded TorchScript model take a significant amount of time. + Due to this longer model warmup + ([pytorch #57894](https://github.com/pytorch/pytorch/issues/57894)), + Triton also allows execution of models without these optimizations.
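+
+(Referring back to the *PyTorch 2.0 Models* section above: a minimal, hedged sketch of producing the `model.pt` file with `torch.save()`. `MyModel` and the destination path are illustrative placeholders that mirror the repository layout shown earlier; they are not part of this backend.)
+
+```python
+import torch
+
+
+class MyModel(torch.nn.Module):
+    """Placeholder model used only for this illustration."""
+
+    def __init__(self):
+        super().__init__()
+        self.linear = torch.nn.Linear(4, 2)
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        return self.linear(x)
+
+
+model = MyModel().eval()
+
+# torch.save() on the full module produces a single-file serialized model;
+# the destination simply mirrors the model repository layout shown above.
+torch.save(model, "model_repository/model_directory/1/model.pt")
+```
+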
+ In some models, optimized execution does not benefit performance + ([pytorch #19978](https://github.com/pytorch/pytorch/issues/19978)) + and in other cases impacts performance negatively + ([pytorch #53824](https://github.com/pytorch/pytorch/issues/53824)). -The initial calls to a loaded TorchScript model take extremely long. Due to this longer -model warmup [issue](https://github.com/pytorch/pytorch/issues/57894), Triton also allows -execution of models without these optimizations. In some models, optimized execution -does not benefit performance as seen [here](https://github.com/pytorch/pytorch/issues/19978) -and in other cases impacts performance negatively, as seen [here](https://github.com/pytorch/pytorch/issues/53824). + The section of model config file specifying this parameter will look like: -The section of model config file specifying this parameter will look like: + ```proto + parameters: { + key: "DISABLE_OPTIMIZED_EXECUTION" + value: { string_value: "true" } + } + ``` -``` -parameters: { -key: "DISABLE_OPTIMIZED_EXECUTION" - value: { - string_value:"true" - } -} -``` +* `INFERENCE_MODE`: -* `INFERENCE_MODE`: Boolean flag to enable the Inference Mode execution -of TorchScript models. By default the inference mode is disabled. + Boolean flag to enable the Inference Mode execution of TorchScript models. + By default, the inference mode is enabled. -[InferenceMode](https://pytorch.org/cppdocs/notes/inference_mode.html) is a new -RAII guard analogous to NoGradMode to be used when you are certain your operations -will have no interactions with autograd. Compared to NoGradMode, code run under -this mode gets better performance by disabling autograd. + [InferenceMode](https://pytorch.org/cppdocs/notes/inference_mode.html) is a new RAII guard analogous to `NoGradMode` to be used when you are certain your operations will have no interactions with autograd. + Compared to `NoGradMode`, code run under this mode gets better performance by disabling autograd. -Please note that in some models, InferenceMode might not benefit performance -and in fewer cases might impact performance negatively. + Please note that in some models, InferenceMode might not benefit performance and in fewer cases might impact performance negatively. -The section of model config file specifying this parameter will look like: + To enable inference mode, use the configuration example below: -``` -parameters: { -key: "INFERENCE_MODE" - value: { - string_value:"true" - } -} -``` + ```proto + parameters: { + key: "INFERENCE_MODE" + value: { string_value: "true" } + } + ``` -* `ENABLE_NVFUSER`: Boolean flag to enable the NvFuser (CUDA Graph -Fuser) optimization for TorchScript models. If not specified, the -default pytorch fuser is used. If `ENABLE_NVFUSER` is specified, the -`ENABLE_TENSOR_FUSER` configuration (see below) is ignored. +* `DISABLE_CUDNN`: -Please note that in some models generated using trace in old PyTorch versions might not work -correctly with NvFuser. We recommend using scripting and a recent version of PyTorch -to generate these models. + Boolean flag to disable the cuDNN library. + By default, cuDNN is enabled. -The section of model config file specifying this parameter will look like: + [cuDNN](https://developer.nvidia.com/cudnn) is a GPU-accelerated library of primitives for deep neural networks. + It provides highly tuned implementations for standard routines. 
-``` -parameters: { -key: "ENABLE_NVFUSER" - value: { - string_value:"true" - } -} -``` + Typically, models run with cuDNN enabled execute faster. + However there are some exceptions where using cuDNN can be slower, cause higher memory usage, or result in errors. -* `ENABLE_WEIGHT_SHARING`: Boolean flag to enable model instances on the same device to -share weights. This optimization should not be used with stateful models. If not specified, -weight sharing is disabled. + To disable cuDNN, use the configuration example below: -The section of model config file specifying this parameter will look like: + ```proto + parameters: { + key: "DISABLE_CUDNN" + value: { string_value: "true" } + } + ``` -``` -parameters: { -key: "ENABLE_WEIGHT_SHARING" - value: { - string_value:"true" - } -} -``` +* `ENABLE_WEIGHT_SHARING`: + + Boolean flag to enable model instances on the same device to share weights. + This optimization should not be used with stateful models. + If not specified, weight sharing is disabled. + + To enable weight sharing, use the configuration example below: + + ```proto + parameters: { + key: "ENABLE_WEIGHT_SHARING" + value: { string_value: "true" } + } + ``` + +* `ENABLE_CACHE_CLEANING`: + + Boolean flag to enable CUDA cache cleaning after each model execution. + If not specified, cache cleaning is disabled. + This flag has no effect if model is on CPU. -* Additional Optimizations: Three additional boolean parameters are available to disable -certain Torch optimizations that can sometimes cause latency regressions in models with -complex execution modes and dynamic shapes. If not specified, all are enabled by default. + Setting this flag to true will likely negatively impact the performance due to additional CUDA cache cleaning operation after each model execution. + Therefore, you should only use this flag if you serve multiple models with Triton and encounter CUDA out-of-memory issues during model executions. + + To enable cleaning of the CUDA cache after every execution, use the configuration example below: + + ```proto + parameters: { + key: "ENABLE_CACHE_CLEANING" + value: { string_value: "true" } + } + ``` + +* `INTER_OP_THREAD_COUNT`: + + PyTorch allows using multiple CPU threads during TorchScript model inference. + One or more inference threads execute a model’s forward pass on the given inputs. + Each inference thread invokes a JIT interpreter that executes the ops of a model inline, one by one. + + This parameter sets the size of this thread pool. + The default value of this setting is the number of cpu cores. + + > [!TIP] + > Refer to + > [CPU Threading TorchScript](https://pytorch.org/docs/stable/notes/cpu_threading_torchscript_inference.html) + > on how to set this parameter properly. + + To set the inter-op thread count, use the configuration example below: + + ```proto + parameters: { + key: "INTER_OP_THREAD_COUNT" + value: { string_value: "1" } + } + ``` + +> [!NOTE] +> This parameter is set globally for the PyTorch backend. +> The value from the first model config file that specifies this parameter will be used. +> Subsequent values from other model config files, if different, will be ignored. + +* `INTRA_OP_THREAD_COUNT`: + + In addition to the inter-op parallelism, PyTorch can also utilize multiple threads within the ops (intra-op parallelism). + This can be useful in many cases, including element-wise ops on large tensors, convolutions, GEMMs, embedding lookups and others. + + The default value for this setting is the number of CPU cores. 
+ + > [!TIP] + > Refer to + > [CPU Threading TorchScript](https://pytorch.org/docs/stable/notes/cpu_threading_torchscript_inference.html) + > on how to set this parameter properly. + + To set the intra-op thread count, use the configuration example below: + + ```proto + parameters: { + key: "INTRA_OP_THREAD_COUNT" + value: { string_value: "1" } + } + ``` + +* **Additional Optimizations**: + + Three additional boolean parameters are available to disable certain Torch optimizations that can sometimes cause latency regressions in models with complex execution modes and dynamic shapes. + If not specified, all are enabled by default. `ENABLE_JIT_EXECUTOR` `ENABLE_JIT_PROFILING` - `ENABLE_TENSOR_FUSER` - -### Important Note - -* The execution of pytorch model on GPU is asynchronous in nature. See - [here](https://pytorch.org/docs/stable/notes/cuda.html#asynchronous-execution) - for more details. Consequently, an error in pytorch model execution may - be raised during the next few inference requests to the server. Setting - environment variable `CUDA_LAUNCH_BLOCKING=1` when launching server will - help in correctly debugging failing cases by forcing synchronous execution. - * The PyTorch model in such cases may or may not recover from the failed - state and a restart of the server may be required to continue serving - successfully. - -* Multiple instances of the pytorch model on GPU do not always - increase performance. Due to thread specific caching in pytorch, using - multiple instances of the model interact negatively. See - [here](https://github.com/pytorch/pytorch/issues/27902) for more details. - Setting the parameter `DISABLE_OPTIMIZED_EXECUTION` to "true" in the model - configuration may help in some cases to avoid these negative interactions - due to model specific caching and increase multiple instance performance. +### Model Instance Group Kind + +The PyTorch backend supports the following kinds of +[Model Instance Groups](https://github.com/triton-inference-server/server/blob/main/docs/user_guide/model_configuration.md#instance-groups) +where the input tensors are placed as follows: + +* `KIND_GPU`: + + Inputs are prepared on the GPU device associated with the model instance. + +* `KIND_CPU`: + + Inputs are prepared on the CPU. + +* `KIND_MODEL`: + + Inputs are prepared on the CPU. + When loading the model, the backend does not choose the GPU device for the model; + instead, it respects the device(s) specified in the model and uses them as they are during inference. + + This is useful when the model internally utilizes multiple GPUs, as demonstrated in + [this example model](https://github.com/triton-inference-server/server/blob/main/qa/L0_libtorch_instance_group_kind_model/gen_models.py). + + > [!IMPORTANT] + > If a device is not specified in the model, the backend uses the first available GPU device. + +To set the model instance group, use the configuration example below: + +```proto +instance_group { + count: 2 + kind: KIND_GPU +} +``` + +### Customization + +The following PyTorch settings may be customized by setting parameters on the +`config.pbtxt`. + +[`torch.set_num_threads(int)`](https://pytorch.org/docs/stable/generated/torch.set_num_threads.html#torch.set_num_threads) + +* Key: `NUM_THREADS` +* Value: The number of threads used for intra-op parallelism on CPU. 
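+
+(As an aside, a small standalone sketch of the two PyTorch calls referenced in this section: `NUM_THREADS` above and `NUM_INTEROP_THREADS`, described next. This is an illustrative Python snippet, not code taken from the backend.)
+
+```python
+import torch
+
+# Inter-op pool: threads that run independent ops in parallel (e.g. in the
+# JIT interpreter). Must be set before any inter-op parallel work starts,
+# otherwise PyTorch raises a RuntimeError.
+torch.set_num_interop_threads(2)
+
+# Intra-op pool: threads used inside individual ops (GEMMs, convolutions, ...).
+torch.set_num_threads(4)
+
+print(torch.get_num_threads(), torch.get_num_interop_threads())
+```
+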
+ +[`torch.set_num_interop_threads(int)`](https://pytorch.org/docs/stable/generated/torch.set_num_interop_threads.html#torch.set_num_interop_threads) + +* Key: `NUM_INTEROP_THREADS` +* Value: The number of threads used for inter-op parallelism (e.g. in the JIT interpreter) on CPU. + +[`torch.compile()` parameters](https://pytorch.org/docs/stable/generated/torch.compile.html#torch-compile) + +* Key: `TORCH_COMPILE_OPTIONAL_PARAMETERS` +* Value: Any of the following parameter(s) encoded as a JSON object. + * `fullgraph` (`bool`): Whether it is OK to break the model into several subgraphs. + * `dynamic` (`bool`): Use dynamic shape tracing. + * `backend` (`str`): The backend to be used. + * `mode` (`str`): Can be either `"default"`, `"reduce-overhead"`, or `"max-autotune"`. + * `options` (`dict`): A dictionary of options to pass to the backend. + * `disable` (`bool`): Turn `torch.compile()` into a no-op for testing. + +For example: + +```proto +parameters: { + key: "NUM_THREADS" + value: { string_value: "4" } +} +parameters: { + key: "TORCH_COMPILE_OPTIONAL_PARAMETERS" + value: { string_value: "{\"disable\": true}" } +} +``` + +## Important Notes + +* The execution of a PyTorch model on a GPU is asynchronous in nature. + See + [CUDA Asynchronous Execution](https://pytorch.org/docs/stable/notes/cuda.html#asynchronous-execution) + for additional details. + Consequently, an error in PyTorch model execution may be raised during the next few inference requests to the server. + Setting the environment variable `CUDA_LAUNCH_BLOCKING=1` when launching the server will help in correctly debugging failing cases by forcing synchronous execution. + + * The PyTorch model in such cases may or may not recover from the failed state and a restart of the server may be required to continue serving successfully. + +* PyTorch does not support Tensor of Strings but it does support models that accept a List of Strings as input(s) / produce a List of Strings as output(s). + For these models, Triton allows users to pass String input(s)/receive String output(s) using the String datatype. + As a limitation of using List instead of Tensor for String I/O, only 1-dimensional input(s)/output(s) are supported for I/O of String type. + +* In a multi-GPU environment, a potential runtime issue can occur when using + [Tracing](https://pytorch.org/docs/stable/generated/torch.jit.trace.html) + to generate a + [TorchScript](https://pytorch.org/docs/stable/jit.html) + model. + This issue arises due to a device mismatch between the model instance and the tensor. + + By default, Triton creates a single execution instance of the model for each available GPU. + The runtime error occurs when a request is sent to a model instance with a different GPU device from the one used during the TorchScript generation process. + + To address this problem, it is highly recommended to use + [Scripting](https://pytorch.org/docs/stable/generated/torch.jit.script.html#torch.jit.script) + instead of Tracing for model generation in a multi-GPU environment. + Scripting avoids the device mismatch issue and ensures compatibility with different GPUs when used with Triton (a brief sketch follows below). + + However, if using Tracing is unavoidable, there is a workaround available.
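+
+Before describing that workaround, here is a minimal sketch of the scripting route recommended above, assuming a trivial `torch.nn.Module` (`ToyModel` and the output path are placeholders, not part of this repository):
+
+```python
+import torch
+
+
+class ToyModel(torch.nn.Module):
+    """Placeholder module with data-dependent control flow."""
+
+    def __init__(self):
+        super().__init__()
+        self.linear = torch.nn.Linear(4, 2)
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        # Branches like this are preserved by scripting but frozen by tracing.
+        if x.sum() > 0:
+            return self.linear(x)
+        return self.linear(-x)
+
+
+model = ToyModel().eval()
+
+# Scripting compiles the Python source directly, so no example input (and no
+# device placement) is baked into the exported TorchScript artifact.
+torch.jit.script(model).save("model_repository/model_directory/1/model.pt")
+
+# Tracing, by contrast, records the ops executed for one concrete input on one
+# concrete device, which is what can lead to the multi-GPU mismatch above:
+# traced = torch.jit.trace(model, torch.randn(1, 4, device="cuda:0"))
+```
+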
+ You can explicitly specify the GPU device for the model instance in the + [model configuration](https://github.com/triton-inference-server/server/blob/main/docs/user_guide/model_configuration.md#instance-groups) + to ensure that the model instance and the tensors used for inference are assigned to the same GPU device as on which the model was traced. + +* When using `KIND_MODEL` as model instance kind, the default device of the first parameter on the model is used. + +> [!WARNING] +> +> * Python functions optimizable by `torch.compile` may not be served directly in the `model.py` file, they need to be enclosed by a class extending the + [`torch.nn.Module`](https://pytorch.org/docs/stable/generated/torch.nn.Module.html#torch.nn.Module). +> +> * Model weights cannot be shared across multiple instances on the same GPU device. diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..1a8da1f --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,49 @@ +# Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +[tool.codespell] +# note: pre-commit passes explicit lists of files here, which this skip file list doesn't override - +# this is only to allow you to run codespell interactively +skip = "./.git,./.github" +# ignore short words, and typename parameters like OffsetT +ignore-regex = "\\b(.{1,4}|[A-Z]\\w*T)\\b" +# use the 'clear' dictionary for unambiguous spelling mistakes +builtin = "clear" +# disable warnings about binary files and wrong encoding +quiet-level = 3 + +[tool.isort] +profile = "black" +use_parentheses = true +multi_line_output = 3 +include_trailing_comma = true +force_grid_wrap = 0 +ensure_newline_before_comments = true +line_length = 88 +balanced_wrapping = true +indent = " " +skip = ["build"] + diff --git a/src/libtorch.cc b/src/libtorch.cc index 6934a6c..500f1f5 100644 --- a/src/libtorch.cc +++ b/src/libtorch.cc @@ -1,4 +1,4 @@ -// Copyright 2019-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// Copyright 2019-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
// // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions @@ -24,1350 +24,13 @@ // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -#include -#include -#include "libtorch_utils.h" -#include "triton/backend/backend_common.h" -#include "triton/backend/backend_input_collector.h" -#include "triton/backend/backend_memory.h" -#include "triton/backend/backend_model.h" -#include "triton/backend/backend_model_instance.h" -#include "triton/backend/backend_output_responder.h" -#include "triton/core/tritonbackend.h" - -#ifdef TRITON_PYTORCH_ENABLE_TORCHVISION -// Suppress warnings in torch headers -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wsign-compare" -#pragma warning(push, 0) -#include -#include // Torchvision header -#pragma warning(pop) -#pragma GCC diagnostic pop -#endif // TRITON_PYTORCH_ENABLE_TORCHVISION - -#ifdef TRITON_ENABLE_GPU -#include -#include -#endif // TRITON_ENABLE_GPU +#include "libtorch.hh" // // PyTorch C++ (LibTorch) Backend that implements the TRITONBACKEND API. // -namespace triton { namespace backend { namespace pytorch { - -// -// ModelState -// -// State associated with a model that is using this backend. An object -// of this class is created and associated with each -// TRITONBACKEND_Model. -// -class ModelState : public BackendModel { - public: - static TRITONSERVER_Error* Create( - TRITONBACKEND_Model* triton_model, ModelState** state); - virtual ~ModelState() = default; - - // Load a TorchScript model using 'artifact_name' as the name for the - // TorchScript file. Return in 'model_path' the full path to the - // TorchScript file, return in 'torch_model' the Torch Module - // representing the model. - TRITONSERVER_Error* LoadModel( - const std::string& artifact_name, const torch::Device device, - std::string* model_path, - std::shared_ptr* torch_model); - - bool EnabledOptimizedExecution() { return enable_optimized_execution_; } - const std::pair& EnabledTensorExprFuser() const - { - return enable_tensor_fuser_pair_; - } - const std::pair& EnabledJitProfiling() const - { - return enable_jit_profiling_pair_; - } - const std::pair& EnabledJitExecutor() const - { - return enable_jit_executor_pair_; - } - bool EnabledInferenceMode() { return enable_inference_mode_; } - const std::pair& EnabledNvfuserPair() const - { - return enable_nvfuser_pair_; - } - - bool EnabledWeightSharing() { return enable_weight_sharing_; } - - private: - ModelState(TRITONBACKEND_Model* triton_model); - TRITONSERVER_Error* AutoCompleteConfig(); - - // Parses and validates parameters in config - TRITONSERVER_Error* ParseParameters(); - - // Flag to indicate whether optimized execution is enabled. Defaults to true. - bool enable_optimized_execution_; - - // Flag to indicate whether inference mode is enabled. Defaults to false. - bool enable_inference_mode_; - - // Flag to indicate whether weight sharing is enabled. Defaults to false. - bool enable_weight_sharing_; - - // Flag pairs to indicate if various JIT settings are set and - // enabled respectively. Defaults to (false, true). Default behavior - // is to do nothing if not explicitly set. Tensor fuser flag is - // ignore if nvfuser is explicitly set. - std::pair enable_tensor_fuser_pair_; - std::pair enable_jit_profiling_pair_; - std::pair enable_jit_executor_pair_; - - // Flag pair to indicate whether nvfuser is set and enabled respectively. 
- // Defaults to (false, false). - std::pair enable_nvfuser_pair_; - - // Model mapping for shared TorchScript model across all instances on the - // same device. The key is a pair of isGPU and device index. - std::map< - std::pair, std::shared_ptr> - torch_models_; -}; - -TRITONSERVER_Error* -ModelState::Create(TRITONBACKEND_Model* triton_model, ModelState** state) -{ - try { - *state = new ModelState(triton_model); - } - catch (const BackendModelException& ex) { - RETURN_ERROR_IF_TRUE( - ex.err_ == nullptr, TRITONSERVER_ERROR_INTERNAL, - std::string("unexpected nullptr in BackendModelException")); - RETURN_IF_ERROR(ex.err_); - } - - // Auto-complete the configuration if requested... - bool auto_complete_config = false; - RETURN_IF_ERROR(TRITONBACKEND_ModelAutoCompleteConfig( - triton_model, &auto_complete_config)); - if (auto_complete_config) { - RETURN_IF_ERROR((*state)->AutoCompleteConfig()); - - triton::common::TritonJson::WriteBuffer json_buffer; - (*state)->ModelConfig().Write(&json_buffer); - - TRITONSERVER_Message* message; - RETURN_IF_ERROR(TRITONSERVER_MessageNewFromSerializedJson( - &message, json_buffer.Base(), json_buffer.Size())); - RETURN_IF_ERROR(TRITONBACKEND_ModelSetConfig( - triton_model, 1 /* config_version */, message)); - } - - RETURN_IF_ERROR((*state)->ParseParameters()); - - return nullptr; // success -} - -ModelState::ModelState(TRITONBACKEND_Model* triton_model) - : BackendModel(triton_model), enable_optimized_execution_(true), - enable_inference_mode_(false), enable_weight_sharing_(false), - enable_tensor_fuser_pair_({false, true}), - enable_jit_profiling_pair_({false, true}), - enable_jit_executor_pair_({false, true}), - enable_nvfuser_pair_({false, false}) -{ -} - -TRITONSERVER_Error* -ModelState::LoadModel( - const std::string& artifact_name, const torch::Device device, - std::string* model_path, - std::shared_ptr* torch_model) -{ - // Find the TorchScript file that describes the model. If the model - // configuration doesn't have an explicit model file specified then - // use the default name ("model.pt"). 
- std::string cc_model_filename = artifact_name; - if (cc_model_filename.empty()) { - cc_model_filename = "model.pt"; - } - - *model_path = JoinPath( - {RepositoryPath(), std::to_string(Version()), cc_model_filename}); - - { - bool exists; - RETURN_IF_ERROR(FileExists(*model_path, &exists)); - RETURN_ERROR_IF_FALSE( - exists, TRITONSERVER_ERROR_UNAVAILABLE, - std::string("unable to find '") + *model_path + - "' for model instance '" + Name() + "'"); - } - - // If weight sharing is enabled, skip loading model if - // it is already available on the target device - std::pair device_pair; - if (enable_weight_sharing_) { - device_pair = std::make_pair(!device.is_cpu(), device.index()); - auto mit = torch_models_.find(device_pair); - if (mit != torch_models_.end()) { - *torch_model = mit->second; - LOG_MESSAGE( - TRITONSERVER_LOG_INFO, - (std::string("Reusing TorchScript model for instance '") + Name() + - "'") - .c_str()); - return nullptr; // success - } - } - - // Serialize the torch model to string - std::string model_data_str; - RETURN_IF_ERROR(ReadTextFile(*model_path, &model_data_str)); - - // InferenceMode should be used to guard all tensors operations including - // model loading: https://pytorch.org/cppdocs/notes/inference_mode.html - torch::InferenceMode infer_guard(EnabledInferenceMode()); - - try { - std::istringstream model_stream(model_data_str); - torch_model->reset( - new torch::jit::Module(torch::jit::load(model_stream, device))); - } - catch (const std::exception& ex) { - return TRITONSERVER_ErrorNew( - TRITONSERVER_ERROR_INTERNAL, - ("failed to load model '" + Name() + "': " + ex.what()).c_str()); - } - - if (enable_weight_sharing_) { - if (!((torch_models_.emplace(device_pair, *torch_model)).second)) { - std::string type = device.is_cpu() ? "CPU" : "GPU"; - LOG_MESSAGE( - TRITONSERVER_LOG_WARN, - (std::string("Model already found on target ") + type + " device " + - "(id " + std::to_string(device.index()) + ") for '" + Name() + "'") - .c_str()); - } - } - - return nullptr; // success -} - -TRITONSERVER_Error* -ModelState::AutoCompleteConfig() -{ - // Auto-complete configuration is not supported since PyTorch does not - // store/capture sufficient model metadata so just log error instead. - LOG_MESSAGE( - TRITONSERVER_LOG_WARN, - (std::string("skipping model configuration auto-complete for '") + - Name() + "': not supported for pytorch backend") - .c_str()); - - return nullptr; // success -} - -TRITONSERVER_Error* -ModelState::ParseParameters() -{ - triton::common::TritonJson::Value params; - bool status = model_config_.Find("parameters", ¶ms); - if (status) { - // If 'DISABLE_OPTIMIZED_EXECUTION' is not present in 'parameters' then no - // update is made to 'enable_optimized_execution_'. - bool disable_optimized_execution = false; - TRITONSERVER_Error* err = ParseParameter( - params, "DISABLE_OPTIMIZED_EXECUTION", &disable_optimized_execution); - if (err != nullptr) { - if (TRITONSERVER_ErrorCode(err) != TRITONSERVER_ERROR_NOT_FOUND) { - return err; - } else { - TRITONSERVER_ErrorDelete(err); - } - } - enable_optimized_execution_ = !disable_optimized_execution; - - LOG_MESSAGE( - TRITONSERVER_LOG_INFO, - (std::string("Optimized execution is ") + - (enable_optimized_execution_ ? "enabled" : "disabled") + - " for model instance '" + Name() + "'") - .c_str()); - - // If 'INFERENCE_MODE' is not present in 'parameters' then no update is made - // to 'enable_inference_mode_'. 
- err = ParseParameter(params, "INFERENCE_MODE", &enable_inference_mode_); - if (err != nullptr) { - if (TRITONSERVER_ErrorCode(err) != TRITONSERVER_ERROR_NOT_FOUND) { - return err; - } else { - TRITONSERVER_ErrorDelete(err); - } - } - - LOG_MESSAGE( - TRITONSERVER_LOG_INFO, - (std::string("Inference Mode is ") + - (enable_inference_mode_ ? "enabled" : "disabled") + - " for model instance '" + Name() + "'") - .c_str()); - - // If 'ENABLE_TENSOR_FUSER' is not present in 'parameters' then no - // update is made to 'enable_tensor_fuser'. - bool enable_tensor_fuser = false; - err = ParseParameter(params, "ENABLE_TENSOR_FUSER", &enable_tensor_fuser); - if (err != nullptr) { - if (TRITONSERVER_ErrorCode(err) != TRITONSERVER_ERROR_NOT_FOUND) { - return err; - } else { - TRITONSERVER_ErrorDelete(err); - } - } else { - enable_tensor_fuser_pair_ = {true, enable_tensor_fuser}; - LOG_MESSAGE( - TRITONSERVER_LOG_INFO, - (std::string("Tensor fuser is ") + - (enable_tensor_fuser ? "enabled" : "disabled") + - " for model instance '" + Name() + "'") - .c_str()); - } - - // If 'ENABLE_WEIGHT_SHARING' is not present in 'parameters' then no - // update is made to 'enable_weight_sharing'. - err = ParseParameter( - params, "ENABLE_WEIGHT_SHARING", &enable_weight_sharing_); - if (err != nullptr) { - if (TRITONSERVER_ErrorCode(err) != TRITONSERVER_ERROR_NOT_FOUND) { - return err; - } else { - TRITONSERVER_ErrorDelete(err); - } - } else { - LOG_MESSAGE( - TRITONSERVER_LOG_INFO, - (std::string("Weight sharing is ") + - (enable_weight_sharing_ ? "enabled" : "disabled") + - " for model instance '" + Name() + "'") - .c_str()); - } - - // If 'ENABLE_JIT_PROFILING' is not present in 'parameters' then no update - // is made to 'enable_jit_profiling'. - bool enable_jit_profiling = false; - err = ParseParameter(params, "ENABLE_JIT_PROFILING", &enable_jit_profiling); - if (err != nullptr) { - if (TRITONSERVER_ErrorCode(err) != TRITONSERVER_ERROR_NOT_FOUND) { - return err; - } else { - TRITONSERVER_ErrorDelete(err); - } - } else { - enable_jit_profiling_pair_ = {true, enable_jit_profiling}; - LOG_MESSAGE( - TRITONSERVER_LOG_INFO, - (std::string("Jit profiling is ") + - (enable_jit_profiling ? "enabled" : "disabled") + - " for model instance '" + Name() + "'") - .c_str()); - } - - // If 'ENABLE_JIT_EXECUTOR' is not present in 'parameters' then no update is - // made to 'enable_jit_executor'. - bool enable_jit_executor = false; - err = ParseParameter(params, "ENABLE_JIT_EXECUTOR", &enable_jit_executor); - if (err != nullptr) { - if (TRITONSERVER_ErrorCode(err) != TRITONSERVER_ERROR_NOT_FOUND) { - return err; - } else { - TRITONSERVER_ErrorDelete(err); - } - } else { - enable_jit_executor_pair_ = {true, enable_jit_executor}; - LOG_MESSAGE( - TRITONSERVER_LOG_INFO, - (std::string("Jit executor is ") + - (enable_jit_executor ? "enabled" : "disabled") + - " for model instance '" + Name() + "'") - .c_str()); - } - - // TODO Re-enable NvFuser once fixed - // If 'ENABLE_NVFUSER' is not present in 'parameters' then no - // update is made to 'enable_nvfuser'. 
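// [Illustrative note; not part of the patch] The optional parameters handled
// throughout ParseParameters() are plain string key/values in the model's
// config.pbtxt, for example:
//
//   parameters: {
//     key: "INFERENCE_MODE"
//     value: { string_value: "true" }
//   }
//   parameters: {
//     key: "ENABLE_WEIGHT_SHARING"
//     value: { string_value: "true" }
//   }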
- bool enable_nvfuser = false; - err = ParseParameter(params, "ENABLE_NVFUSER", &enable_nvfuser); - if (err != nullptr) { - if (TRITONSERVER_ErrorCode(err) != TRITONSERVER_ERROR_NOT_FOUND) { - return err; - } else { - LOG_MESSAGE( - TRITONSERVER_LOG_INFO, (std::string("NvFuser is not specified") + - " for model instance '" + Name() + "'") - .c_str()); - TRITONSERVER_ErrorDelete(err); - } - } else { - // Override, disable NvFuser till fixed - enable_nvfuser = false; - enable_nvfuser_pair_ = {true, enable_nvfuser}; - LOG_MESSAGE( - TRITONSERVER_LOG_WARN, (std::string("NvFuser is ") + - (enable_nvfuser ? "enabled" : "disabled") + - " for model instance '" + Name() + "'") - .c_str()); - } - } - - return nullptr; -} - -// -// ModelInstanceState -// -// State associated with a model instance. An object of this class is -// created and associated with each TRITONBACKEND_ModelInstance. -// -class ModelInstanceState : public BackendModelInstance { - public: - static TRITONSERVER_Error* Create( - ModelState* model_state, - TRITONBACKEND_ModelInstance* triton_model_instance, - ModelInstanceState** state); - virtual ~ModelInstanceState(); - - // Get the state of the model that corresponds to this instance. - ModelState* StateForModel() const { return model_state_; } - - // Execute... - void ProcessRequests( - TRITONBACKEND_Request** requests, const uint32_t request_count); - - private: - ModelInstanceState( - ModelState* model_state, - TRITONBACKEND_ModelInstance* triton_model_instance); - TRITONSERVER_Error* ValidateBooleanSequenceControl( - triton::common::TritonJson::Value& sequence_batching, - const std::string& control_kind, bool required, bool* have_control); - TRITONSERVER_Error* ValidateTypedSequenceControl( - triton::common::TritonJson::Value& sequence_batching, - const std::string& control_kind, bool required, bool* have_control); - TRITONSERVER_Error* ValidateInputs(const size_t expected_input_cnt); - TRITONSERVER_Error* ValidateOutputs(); - void Execute( - std::vector* responses, - const uint32_t response_count, - std::vector* input_tensors, - std::vector* output_tensors); - TRITONSERVER_Error* SetInputTensors( - size_t total_batch_size, TRITONBACKEND_Request** requests, - const uint32_t request_count, - std::vector* responses, - BackendInputCollector* collector, std::vector* input_names, - std::vector* input_tensors, - std::vector* input_memories, bool* cuda_copy); - TRITONSERVER_Error* ReadOutputTensors( - size_t total_batch_size, const std::vector& output_names, - const std::vector& output_tensors, - TRITONBACKEND_Request** requests, const uint32_t request_count, - std::vector* responses, - uint64_t* compute_end_ns); - - ModelState* model_state_; - - // The full path to the TorchScript model file. - std::string model_path_; - - std::shared_ptr torch_model_; - torch::Device device_; - - // Map from configuration name for an input to the index of - // that input in the model. - std::unordered_map input_index_map_; - - // Map from configuration name for an output to the index of - // that output in the model. - std::unordered_map output_index_map_; - std::unordered_map output_dtype_map_; - - // If the input to the tensor is a dictionary of tensors. 
- bool is_dict_input_; -}; - -TRITONSERVER_Error* -ModelInstanceState::Create( - ModelState* model_state, TRITONBACKEND_ModelInstance* triton_model_instance, - ModelInstanceState** state) -{ - try { - *state = new ModelInstanceState(model_state, triton_model_instance); - } - catch (const BackendModelInstanceException& ex) { - RETURN_ERROR_IF_TRUE( - ex.err_ == nullptr, TRITONSERVER_ERROR_INTERNAL, - std::string("unexpected nullptr in BackendModelInstanceException")); - RETURN_IF_ERROR(ex.err_); - } - - return nullptr; // success -} - -ModelInstanceState::ModelInstanceState( - ModelState* model_state, TRITONBACKEND_ModelInstance* triton_model_instance) - : BackendModelInstance(model_state, triton_model_instance), - model_state_(model_state), device_(torch::kCPU), is_dict_input_(false) -{ - if (Kind() == TRITONSERVER_INSTANCEGROUPKIND_GPU) { - device_ = torch::Device(torch::kCUDA, DeviceId()); - } - - THROW_IF_BACKEND_INSTANCE_ERROR(model_state->LoadModel( - ArtifactFilename(), device_, &model_path_, &torch_model_)); - - size_t expected_input_cnt = 0; - { - triton::common::TritonJson::Value inputs; - if (model_state->ModelConfig().Find("input", &inputs)) { - expected_input_cnt = inputs.ArraySize(); - } - } - - // If this is a sequence model then make sure that the required - // inputs are present in the model and have the correct shape and - // datatype. - triton::common::TritonJson::Value sequence_batching; - if (model_state->ModelConfig().Find( - "sequence_batching", &sequence_batching)) { - bool have_start, have_end, have_ready, have_corrid; - THROW_IF_BACKEND_INSTANCE_ERROR(ValidateBooleanSequenceControl( - sequence_batching, "CONTROL_SEQUENCE_START", false /* required */, - &have_start)); - THROW_IF_BACKEND_INSTANCE_ERROR(ValidateBooleanSequenceControl( - sequence_batching, "CONTROL_SEQUENCE_END", false /* required */, - &have_end)); - THROW_IF_BACKEND_INSTANCE_ERROR(ValidateBooleanSequenceControl( - sequence_batching, "CONTROL_SEQUENCE_READY", false /* required */, - &have_ready)); - THROW_IF_BACKEND_INSTANCE_ERROR(ValidateTypedSequenceControl( - sequence_batching, "CONTROL_SEQUENCE_CORRID", false /* required */, - &have_corrid)); - if (have_start) { - expected_input_cnt += 1; - } - if (have_end) { - expected_input_cnt += 1; - } - if (have_ready) { - expected_input_cnt += 1; - } - if (have_corrid) { - expected_input_cnt += 1; - } - } - - THROW_IF_BACKEND_INSTANCE_ERROR(ValidateInputs(expected_input_cnt)); - THROW_IF_BACKEND_INSTANCE_ERROR(ValidateOutputs()); -} - -ModelInstanceState::~ModelInstanceState() -{ - torch_model_.reset(); -#ifdef TRITON_ENABLE_GPU - if (device_.is_cuda()) { - c10::cuda::CUDACachingAllocator::emptyCache(); - } -#endif // TRITON_ENABLE_GPU -} - -TRITONSERVER_Error* -ModelInstanceState::ValidateBooleanSequenceControl( - triton::common::TritonJson::Value& sequence_batching, - const std::string& control_kind, bool required, bool* have_control) -{ - std::string tensor_name; - std::string tensor_datatype; - RETURN_IF_ERROR(GetBooleanSequenceControlProperties( - sequence_batching, model_state_->Name(), control_kind, required, - &tensor_name, &tensor_datatype, nullptr, nullptr, nullptr, nullptr, - nullptr, nullptr)); - *have_control = !tensor_name.empty(); - if (*have_control) { - std::string deliminator = "__"; - int ip_index = 0; - try { - int start_pos = tensor_name.find(deliminator); - if (start_pos == -1) { - throw std::invalid_argument("input must follow naming convention"); - } - ip_index = std::atoi(tensor_name.substr(start_pos + 2).c_str()); - 
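// [Illustrative sketch; not part of the patch] The "<name>__<index>" naming
// convention being enforced here, as a standalone helper (hypothetical name):
//
//   #include <cstdlib>
//   #include <optional>
//   #include <string>
//
//   std::optional<int> ParseTensorIndex(const std::string& tensor_name)
//   {
//     const size_t pos = tensor_name.find("__");
//     if (pos == std::string::npos) {
//       return std::nullopt;  // e.g. "START" carries no trailing index
//     }
//     // "INPUT__2" -> 2, "CORRID__3" -> 3
//     return std::atoi(tensor_name.substr(pos + 2).c_str());
//   }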
input_index_map_[tensor_name] = ip_index; - } - catch (std::exception& ex) { - return TRITONSERVER_ErrorNew( - TRITONSERVER_ERROR_INTERNAL, - ("input '" + tensor_name + - "' does not follow naming convention i.e. __.") - .c_str()); - } - } - - return nullptr; // success -} - -TRITONSERVER_Error* -ModelInstanceState::ValidateTypedSequenceControl( - triton::common::TritonJson::Value& sequence_batching, - const std::string& control_kind, bool required, bool* have_control) -{ - std::string tensor_name; - std::string tensor_datatype; - RETURN_IF_ERROR(GetTypedSequenceControlProperties( - sequence_batching, model_state_->Name(), control_kind, required, - &tensor_name, &tensor_datatype)); - *have_control = !tensor_name.empty(); - if (*have_control) { - std::string deliminator = "__"; - int ip_index = 0; - try { - int start_pos = tensor_name.find(deliminator); - if (start_pos == -1) { - throw std::invalid_argument("input must follow naming convention"); - } - ip_index = std::atoi(tensor_name.substr(start_pos + 2).c_str()); - input_index_map_[tensor_name] = ip_index; - } - catch (std::exception& ex) { - return TRITONSERVER_ErrorNew( - TRITONSERVER_ERROR_INTERNAL, - ("input '" + tensor_name + - "' does not follow naming convention i.e. __.") - .c_str()); - } - } - - return nullptr; // success -} - -TRITONSERVER_Error* -ModelInstanceState::ValidateInputs(const size_t expected_input_cnt) -{ - // Collect all the expected input tensor names and validate that the model - // configuration specifies only those. - std::set allowed_inputs; - - const torch::jit::Method& method = torch_model_->get_method("forward"); - const auto& schema = method.function().getSchema(); - const std::vector& arguments = schema.arguments(); - - // Currently, only models with a single input of type Dict(str, Tensor) are - // supported. If the model expects more than one input then they must be all - // be of type Tensor. - // - // Ignore the argument at idx 0 if it is of Class type (self param in forward - // function) - size_t start_idx = 0; - if ((arguments.size() > 0) && - (arguments.at(0).type()->kind() == c10::TypeKind::ClassType)) { - start_idx = 1; - } - if ((arguments.size() == (1 + start_idx)) && - (arguments.at(start_idx).type()->kind() == c10::TypeKind::DictType)) { - is_dict_input_ = true; - } else if (arguments.size() > start_idx) { - // Return error if multiple inputs are of kind DictType - for (size_t i = start_idx + 1; i < arguments.size(); i++) { - if (arguments.at(i).type()->kind() == c10::TypeKind::DictType) { - return TRITONSERVER_ErrorNew( - TRITONSERVER_ERROR_INTERNAL, - "Multiple inputs of kind DictType were detected. Only a single " - "input of type Dict(str, Tensor) is supported."); - } - } - - // Return error if all inputs are not of type Tensor - for (size_t i = start_idx; i < arguments.size(); i++) { - if (arguments.at(i).type()->kind() != c10::TypeKind::TensorType) { - return TRITONSERVER_ErrorNew( - TRITONSERVER_ERROR_INTERNAL, - (std::string("An input of type '") + arguments.at(i).type()->str() + - "' was detected in the model. 
Only a single input of type " - "Dict(str, Tensor) or input(s) of type Tensor are supported.") - .c_str()); - } - allowed_inputs.emplace(arguments.at(i).name()); - } - - // If all inputs are tensors, match number of expected inputs between model - // and configuration - if ((arguments.size() - start_idx) != expected_input_cnt) { - return TRITONSERVER_ErrorNew( - TRITONSERVER_ERROR_INVALID_ARG, - (std::string("unable to load model '") + model_state_->Name() + - "', configuration expects " + std::to_string(expected_input_cnt) + - " inputs, model provides " + - std::to_string(arguments.size() - start_idx)) - .c_str()); - } - } - - triton::common::TritonJson::Value ios; - RETURN_IF_ERROR(model_state_->ModelConfig().MemberAsArray("input", &ios)); - std::string deliminator = "__"; - int ip_index = 0; - - if (ios.ArraySize() == 0) { - return TRITONSERVER_ErrorNew( - TRITONSERVER_ERROR_INTERNAL, - "model configuration must contain at least one input, none were " - "specified."); - } - - for (size_t i = 0; i < ios.ArraySize(); i++) { - triton::common::TritonJson::Value io; - RETURN_IF_ERROR(ios.IndexAsObject(i, &io)); - - // Validate name - std::string io_name; - RETURN_IF_ERROR(io.MemberAsString("name", &io_name)); - if (is_dict_input_) { - // If dictionary, index is irrelevant but we use the map to store the - // input names since they are the keys for the dictionary - input_index_map_[io_name] = i; - } else { - // input tensor name must be in 'allowed_inputs' or must follow the naming - // convention - auto itr = allowed_inputs.find(io_name); - if (itr != allowed_inputs.end()) { - input_index_map_[io_name] = std::distance(allowed_inputs.begin(), itr); - } else { - try { - int start_pos = io_name.find(deliminator); - if (start_pos == -1) { - throw std::invalid_argument("input must follow naming convention"); - } - ip_index = std::atoi(io_name.substr(start_pos + 2).c_str()); - input_index_map_[io_name] = ip_index; - } - catch (std::exception& ex) { - return TRITONSERVER_ErrorNew( - TRITONSERVER_ERROR_INTERNAL, - ("input '" + io_name + - "' is neither an input argument to the model nor does it " - "follow the naming convention i.e. 
__.") - .c_str()); - } - } - } - - // Validate data type - std::string io_dtype; - RETURN_IF_ERROR(io.MemberAsString("data_type", &io_dtype)); - const auto pr = ModelConfigDataTypeToTorchType(io_dtype); - if (!pr.first) { - return TRITONSERVER_ErrorNew( - TRITONSERVER_ERROR_INTERNAL, - ("unsupported datatype " + io_dtype + " for input '" + io_name + - "' for model '" + model_state_->Name() + "'") - .c_str()); - } - } - - return nullptr; // success -} - -TRITONSERVER_Error* -ModelInstanceState::ValidateOutputs() -{ - triton::common::TritonJson::Value ios; - RETURN_IF_ERROR(model_state_->ModelConfig().MemberAsArray("output", &ios)); - std::string deliminator = "__"; - int op_index = 0; - - if (ios.ArraySize() == 0) { - return TRITONSERVER_ErrorNew( - TRITONSERVER_ERROR_INTERNAL, - "model configuration must contain at least one output, none were " - "specified."); - } - - for (size_t i = 0; i < ios.ArraySize(); i++) { - triton::common::TritonJson::Value io; - RETURN_IF_ERROR(ios.IndexAsObject(i, &io)); - - // Validate name - std::string io_name; - RETURN_IF_ERROR(io.MemberAsString("name", &io_name)); - try { - int start_pos = io_name.find(deliminator); - if (start_pos == -1) { - throw std::invalid_argument("output must follow naming convention"); - } - op_index = std::atoi(io_name.substr(start_pos + 2).c_str()); - } - catch (std::exception& ex) { - return TRITONSERVER_ErrorNew( - TRITONSERVER_ERROR_INTERNAL, - ("output '" + io_name + - "' does not follow naming convention i.e. __.") - .c_str()); - } - - // Validate data type - std::string io_dtype; - RETURN_IF_ERROR(io.MemberAsString("data_type", &io_dtype)); - const auto pr = ModelConfigDataTypeToTorchType(io_dtype); - if (!pr.first) { - return TRITONSERVER_ErrorNew( - TRITONSERVER_ERROR_INTERNAL, - ("unsupported datatype " + io_dtype + " for output '" + io_name + - "' for model '" + model_state_->Name() + "'") - .c_str()); - } - output_index_map_[io_name] = op_index; - output_dtype_map_[io_name] = ConvertTorchTypeToDataType(pr.second); - } - - return nullptr; // success -} - -void -ModelInstanceState::ProcessRequests( - TRITONBACKEND_Request** requests, const uint32_t request_count) -{ - LOG_MESSAGE( - TRITONSERVER_LOG_VERBOSE, - (std::string("TRITONBACKEND_ModelExecute: Running ") + Name() + " with " + - std::to_string(request_count) + " requests") - .c_str()); - - uint64_t exec_start_ns = 0; - SET_TIMESTAMP(exec_start_ns); - - const int max_batch_size = model_state_->MaxBatchSize(); - - // For each request collect the total batch size for this inference - // execution. The batch-size, number of inputs, and size of each - // input has already been checked so don't need to do that here. - size_t total_batch_size = 0; - for (size_t i = 0; i < request_count; i++) { - // If we get a nullptr request then something is badly wrong. Fail - // and release all requests. - if (requests[i] == nullptr) { - RequestsRespondWithError( - requests, request_count, - TRITONSERVER_ErrorNew( - TRITONSERVER_ERROR_INTERNAL, - std::string( - "null request given to PyTorch backend for '" + Name() + "'") - .c_str())); - return; - } - } - - // At this point we are committed to running inference with all - // 'requests'. Create a response for each request. During input - // processing if there is an error with any request that error will - // be sent immediately with the corresponding response (and the - // response unique_ptr will then be nullptr). 
The request object - // itself will not be released until after all inferencing is done - // (below) as we may need to access the request object when - // determine how to process outputs (for example, even if we don't - // need the outputs for a request that has an error, we do need to - // know the size of those outputs associated with the request so we - // can skip them in the output tensors). - std::vector responses; - responses.reserve(request_count); - bool all_response_failed = false; - - for (size_t i = 0; i < request_count; i++) { - TRITONBACKEND_Response* response; - auto err = TRITONBACKEND_ResponseNew(&response, requests[i]); - if (err == nullptr) { - responses.emplace_back(response); - } else { - responses.emplace_back(nullptr); - LOG_MESSAGE(TRITONSERVER_LOG_ERROR, "Fail to create response"); - TRITONSERVER_ErrorDelete(err); - } - } - - - for (size_t i = 0; i < request_count; i++) { - if (max_batch_size > 0) { - // Retrieve the batch size from one of the inputs, if the model - // supports batching, the first dimension size is batch size - TRITONBACKEND_Input* input; - TRITONSERVER_Error* err = - TRITONBACKEND_RequestInputByIndex(requests[i], 0 /* index */, &input); - if (err == nullptr) { - const int64_t* shape; - err = TRITONBACKEND_InputProperties( - input, nullptr, nullptr, &shape, nullptr, nullptr, nullptr); - total_batch_size += shape[0]; - } - if (err != nullptr) { - RESPOND_ALL_AND_SET_TRUE_IF_ERROR( - responses, request_count, all_response_failed, err); - } - } else { - total_batch_size += 1; - } - } - - // If there are no valid payloads then no need to run the inference. - if (total_batch_size == 0) { - return; - } - - // Make sure the maximum batch size is not exceeded. The - // total_batch_size must be 1 for models that don't support batching - // (i.e. max_batch_size == 0). If max_batch_size is exceeded then - // scheduler has done something badly wrong so fail and release all - // requests. - if (!all_response_failed) { - if ((total_batch_size != 1) && - (total_batch_size > (size_t)max_batch_size)) { - RESPOND_ALL_AND_SET_TRUE_IF_ERROR( - responses, request_count, all_response_failed, - TRITONSERVER_ErrorNew( - TRITONSERVER_ERROR_INTERNAL, - std::string( - "batch size " + std::to_string(total_batch_size) + " for '" + - Name() + "', max allowed is " + - std::to_string(max_batch_size)) - .c_str())); - } - } - - std::vector input_names; - std::vector input_tensors; - std::vector input_memories; - bool cuda_copy = false; - std::unique_ptr collector; - - if (!all_response_failed) { - collector.reset(new BackendInputCollector( - requests, request_count, &responses, - model_state_->TritonMemoryManager(), model_state_->EnablePinnedInput(), - CudaStream(), nullptr, nullptr, 0, HostPolicyName().c_str())); - RESPOND_ALL_AND_SET_TRUE_IF_ERROR( - responses, request_count, all_response_failed, - SetInputTensors( - total_batch_size, requests, request_count, &responses, - collector.get(), &input_names, &input_tensors, &input_memories, - &cuda_copy)); - } - - // Request to retrieve all model outputs. 'output_names' and - // 'output_tensors' are parallel vectors and so must be kept in - // sync. 
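// [Illustrative note; not part of the patch] A worked example of the batch
// accounting earlier in ProcessRequests(): with max_batch_size >= 8 and three
// requests whose first input has shape [2,3], [1,3] and [5,3],
// total_batch_size = 2 + 1 + 5 = 8 and the collected input tensor passed to
// the model has shape [8,3]. For a non-batching model (max_batch_size == 0)
// each request contributes exactly 1.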
- std::vector output_names; - std::vector output_tensors; - if (!all_response_failed) { - triton::common::TritonJson::Value ios; - TRITONSERVER_Error* err = - model_state_->ModelConfig().MemberAsArray("output", &ios); - if (err == nullptr) { - for (size_t i = 0; i < ios.ArraySize(); i++) { - triton::common::TritonJson::Value io; - err = ios.IndexAsObject(i, &io); - if (err != nullptr) { - break; - } - - // Use names from ModelConfig by reference since the model - // config will persist longer than this inference execution. - const char* io_name; - size_t io_name_len; - err = io.MemberAsString("name", &io_name, &io_name_len); - if (err != nullptr) { - break; - } - - output_names.emplace_back(io_name); - } - } - - if (err != nullptr) { - RESPOND_ALL_AND_SET_TRUE_IF_ERROR( - responses, request_count, all_response_failed, err); - output_names.clear(); - } - } - -// Wait for any in-flight input tensor copies to complete. -#ifdef TRITON_ENABLE_GPU - if (cuda_copy) { - cudaStreamSynchronize(CudaStream()); - } -#endif - - uint64_t compute_start_ns = 0; - SET_TIMESTAMP(compute_start_ns); - - // Run... - if (!all_response_failed) { - Execute(&responses, request_count, &input_tensors, &output_tensors); - } - - // Free BackendMemory used for inputs - for (BackendMemory* mem : input_memories) { - if (mem != nullptr) { - delete mem; - } - } - input_memories.clear(); - - // Verify output indices are valid with number of outputs after execution - bool invalid_index = false; - int max_index = output_tensors.size() - 1; - - if (!all_response_failed) { - for (const auto& name : output_names) { - int op_index = output_index_map_[name]; - if ((op_index < 0) || (op_index > max_index)) { - RESPOND_ALL_AND_SET_TRUE_IF_ERROR( - responses, request_count, all_response_failed, - TRITONSERVER_ErrorNew( - TRITONSERVER_ERROR_INVALID_ARG, - std::string( - "The output " + std::string(name) + - " in the model configuration refers to an output index " - "which" - " doesn't exist. This model has " + - std::to_string(max_index + 1) + " outputs") - .c_str())); - invalid_index = true; - break; - } - } - } - - uint64_t compute_end_ns = 0; - - if (!all_response_failed) { - if (!invalid_index) { - RESPOND_ALL_AND_SET_TRUE_IF_ERROR( - responses, request_count, all_response_failed, - ReadOutputTensors( - total_batch_size, output_names, output_tensors, requests, - request_count, &responses, &compute_end_ns)); - } - } - - uint64_t exec_end_ns = 0; - SET_TIMESTAMP(exec_end_ns); - - // Send all the responses that haven't already been sent because of - // an earlier error. Note that the responses are not set to nullptr - // here as we need that indication below to determine if the request - // we successful or not. - for (auto& response : responses) { - if (response != nullptr) { - LOG_IF_ERROR( - TRITONBACKEND_ResponseSend( - response, TRITONSERVER_RESPONSE_COMPLETE_FINAL, nullptr), - "failed to send PyTorch backend response"); - } - } - - // Report statistics for each request. - for (uint32_t r = 0; r < request_count; ++r) { - auto& request = requests[r]; - LOG_IF_ERROR( - TRITONBACKEND_ModelInstanceReportStatistics( - TritonModelInstance(), request, - (responses[r] != nullptr) /* success */, exec_start_ns, - compute_start_ns, compute_end_ns, exec_end_ns), - "failed reporting request statistics"); - - LOG_IF_ERROR( - TRITONBACKEND_RequestRelease(request, TRITONSERVER_REQUEST_RELEASE_ALL), - "failed releasing request"); - } - - if (!all_response_failed) { - // Report the entire batch statistics. 
- LOG_IF_ERROR( - TRITONBACKEND_ModelInstanceReportBatchStatistics( - TritonModelInstance(), total_batch_size, exec_start_ns, - compute_start_ns, compute_end_ns, exec_end_ns), - "failed reporting batch request statistics"); - } -} - -void -ModelInstanceState::Execute( - std::vector* responses, - const uint32_t response_count, - std::vector* input_tensors, - std::vector* output_tensors) -{ - torch::jit::IValue model_outputs_; - - try { - // enable/disable optimized execution - torch::jit::setGraphExecutorOptimize( - model_state_->EnabledOptimizedExecution()); - - // enable/disable inference mode - supersedes NoGradGuard - torch::InferenceMode infer_guard(model_state_->EnabledInferenceMode()); - - // JIT. No change is made unless parameter is explicitly set. - if (std::get<0>(model_state_->EnabledJitProfiling())) { - torch::jit::getProfilingMode() = - std::get<1>(model_state_->EnabledJitProfiling()); - } - - if (std::get<0>(model_state_->EnabledJitExecutor())) { - torch::jit::getExecutorMode() = - std::get<1>(model_state_->EnabledJitExecutor()); - } - - // Fuser. Parameter is ignored if NVFuser parameter is explicitily - // set (either enabled or disabled). No change is made unless - // fuser is explicitly set in parameters. - if (!std::get<0>(model_state_->EnabledNvfuserPair()) && - std::get<0>(model_state_->EnabledTensorExprFuser())) { - torch::jit::setTensorExprFuserEnabled( - std::get<1>(model_state_->EnabledTensorExprFuser())); - } - - // NV-Fuser. No change is made unless parameter is explicitly set. - if (std::get<0>(model_state_->EnabledNvfuserPair())) { - if (std::get<1>(model_state_->EnabledNvfuserPair()) && - (device_.type() != torch::kCPU)) { - torch::jit::overrideCanFuseOnCPU(false); - torch::jit::overrideCanFuseOnGPU(false); - torch::jit::setTensorExprFuserEnabled(false); - torch::jit::RegisterCudaFuseGraph::registerPass(true); - } else { - torch::jit::overrideCanFuseOnCPU(true); - torch::jit::overrideCanFuseOnGPU(true); - torch::jit::setTensorExprFuserEnabled(true); - torch::jit::RegisterCudaFuseGraph::registerPass(false); - } - } - - torch::NoGradGuard no_grad; - - // If input is a dictionary, prepare dictionary from 'input_tensors'. 
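// [Illustrative note; not part of the patch] A model taking the dictionary
// path below declares its forward() with a single Dict[str, Tensor] argument,
// keyed by the input names from the model configuration; for example, in the
// Python source that was scripted:
//
//   from typing import Dict
//   import torch
//
//   class DictModel(torch.nn.Module):
//       def forward(self, inputs: Dict[str, torch.Tensor]) -> torch.Tensor:
//           return inputs["input_ids"] + inputs["token_type_ids"]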
- if (is_dict_input_) { - torch::Dict input_dict; - for (auto& input_index : input_index_map_) { - torch::jit::IValue ival = (*input_tensors)[input_index.second]; - input_dict.insert(input_index.first, ival.toTensor()); - } - std::vector input_dict_ivalue = {input_dict}; - model_outputs_ = torch_model_->forward(input_dict_ivalue); - } else { - model_outputs_ = torch_model_->forward(*input_tensors); - } - - if (model_outputs_.isTuple()) { - auto model_outputs_tuple = model_outputs_.toTuple(); - for (auto& m_op : model_outputs_tuple->elements()) { - output_tensors->push_back(m_op.toTensor()); - } - } else { - try { - auto model_output_tensor = model_outputs_.toTensor(); - output_tensors->push_back(model_output_tensor); - } - catch (std::exception& exx) { - throw std::invalid_argument( - "Output of torch model should be tensor or a tuple of tensors, not " - "a list / dictionary of tensors or a scalar: " + - std::string(exx.what())); - } - } - } - catch (std::exception& ex) { - SendErrorForResponses( - responses, response_count, - TRITONSERVER_ErrorNew( - TRITONSERVER_ERROR_INTERNAL, - ("PyTorch execute failure: " + std::string(ex.what())).c_str())); - } -} - -TRITONSERVER_Error* -ModelInstanceState::SetInputTensors( - size_t total_batch_size, TRITONBACKEND_Request** requests, - const uint32_t request_count, - std::vector* responses, - BackendInputCollector* collector, std::vector* input_names, - std::vector* input_tensors, - std::vector* input_memories, bool* cuda_copy) -{ - const int max_batch_size = model_state_->MaxBatchSize(); - - // InferenceMode should be used to guard all tensors operations - torch::InferenceMode infer_guard(model_state_->EnabledInferenceMode()); - - // All requests must have equally-sized input tensors so use any - // request as the representative for the input tensors. - uint32_t input_count; - RETURN_IF_ERROR(TRITONBACKEND_RequestInputCount(requests[0], &input_count)); - input_tensors->resize(input_count); - for (uint32_t input_idx = 0; input_idx < input_count; input_idx++) { - TRITONBACKEND_Input* input; - RETURN_IF_ERROR( - TRITONBACKEND_RequestInputByIndex(requests[0], input_idx, &input)); - - const char* input_name; - TRITONSERVER_DataType input_datatype; - const int64_t* input_shape; - uint32_t input_dims_count; - RETURN_IF_ERROR(TRITONBACKEND_InputProperties( - input, &input_name, &input_datatype, &input_shape, &input_dims_count, - nullptr, nullptr)); - - input_names->emplace_back(input_name); - - // The shape for the entire input patch, [total_batch_size, ...] - std::vector batchn_shape( - input_shape, input_shape + input_dims_count); - if (max_batch_size != 0) { - batchn_shape[0] = total_batch_size; - } - - // The input must be in contiguous CPU/GPU memory. - std::vector> alloc_perference; - if (device_.is_cpu()) { - alloc_perference = {{TRITONSERVER_MEMORY_CPU_PINNED, 0}, - {TRITONSERVER_MEMORY_CPU, 0}}; - } else { - alloc_perference = {{TRITONSERVER_MEMORY_GPU, device_.index()}}; - } - - const char* input_buffer; - size_t batchn_byte_size; - TRITONSERVER_MemoryType memory_type; - int64_t memory_type_id; - RETURN_IF_ERROR(collector->ProcessTensor( - input_name, nullptr, 0, alloc_perference, &input_buffer, - &batchn_byte_size, &memory_type, &memory_type_id)); - - // Create Torch tenor - const auto torch_dtype = ConvertDataTypeToTorchType(input_datatype); - torch::TensorOptions options{torch_dtype.second}; - auto updated_options = (memory_type == TRITONSERVER_MEMORY_GPU) - ? 
options.device(torch::kCUDA, device_.index()) - : options.device(torch::kCPU); - - // Remove constness to align with the signature of torch::from_blob() - torch::Tensor input_tensor = torch::from_blob( - const_cast(input_buffer), batchn_shape, updated_options); - (*input_tensors)[input_index_map_[input_name]] = input_tensor; - } - - // Finalize... - *cuda_copy |= collector->Finalize(); - - return nullptr; -} - -TRITONSERVER_Error* -ModelInstanceState::ReadOutputTensors( - size_t total_batch_size, const std::vector& output_names, - const std::vector& output_tensors, - TRITONBACKEND_Request** requests, const uint32_t request_count, - std::vector* responses, uint64_t* compute_end_ns) -{ - BackendOutputResponder responder( - requests, request_count, responses, model_state_->TritonMemoryManager(), - model_state_->MaxBatchSize() > 0, model_state_->EnablePinnedInput(), - CudaStream()); - - bool cuda_copy = false; - std::vector> string_buffers; - for (size_t idx = 0; idx < output_names.size(); idx++) { - std::string name = output_names[idx]; - int op_index = output_index_map_[name]; - torch::Tensor output_flat; - - try { - output_flat = output_tensors[op_index].contiguous().flatten(); - } - catch (std::exception& ex) { - RETURN_IF_ERROR(TRITONSERVER_ErrorNew( - TRITONSERVER_ERROR_INTERNAL, - (std::string("output tensor '") + name + "' is not found").c_str())); - } - - // Verify output datatype matches datatype from model config - TRITONSERVER_DataType output_dtype = - ConvertTorchTypeToDataType(output_flat.scalar_type()); - TRITONSERVER_DataType config_datatype = output_dtype_map_[name]; - if (config_datatype != output_dtype) { - RETURN_IF_ERROR(TRITONSERVER_ErrorNew( - TRITONSERVER_ERROR_INVALID_ARG, - (std::string("configuration expects datatype TYPE_") + - TRITONSERVER_DataTypeString(config_datatype) + " for output '" + - name + "', model provides TYPE_" + - TRITONSERVER_DataTypeString(output_dtype)) - .c_str())); - } - - const char* output_buffer = - static_cast(output_flat.data_ptr()); - - // Output tensors may not reside on the same device as model - torch::Device tensor_device = output_flat.device(); - - // Set output shape - std::vector batchn_shape; - auto shape = output_tensors[op_index].sizes(); - for (auto itr = shape.begin(); itr != shape.end(); itr++) { - batchn_shape.push_back(*itr); - } - - if (batchn_shape.size() == 0) { - RETURN_IF_ERROR(TRITONSERVER_ErrorNew( - TRITONSERVER_ERROR_INVALID_ARG, - (std::string("output '") + name + - "' is a scalar which is not supported.") - .c_str())); - } - - responder.ProcessTensor( - name, output_dtype, batchn_shape, output_buffer, - (tensor_device.type() == torch::kCPU) ? TRITONSERVER_MEMORY_CPU - : TRITONSERVER_MEMORY_GPU, - (tensor_device.type() == torch::kCPU) ? 0 : tensor_device.index()); - - // PyTorch uses asynchronous execution to run the model. Setting the compute - // end timestamp immediately after Execute() does not capture the complete - // model execution time. When the first output buffer is accessed/copied by - // ProcessTensor(), there is a synchronization that is done to ensure the - // data is correctly copied from the output tensor. To avoid overheads of - // additional synchronization, we continue to use the default cuda stream. - // However the drawback of this is that the compute infer time reported - // would be slightly later than it is in reality and the compute output time - // reported would be smaller than it is in reality. We allow this because - // synchronizing manually negatively impacts performance. 
- if (idx == 0) { - SET_TIMESTAMP(*compute_end_ns); - } - } - - // Finalize and wait for any pending buffer copies. - cuda_copy |= responder.Finalize(); - -#ifdef TRITON_ENABLE_GPU - if (cuda_copy) { - cudaStreamSynchronize(stream_); - } -#endif // TRITON_ENABLE_GPU - - return nullptr; -} - -///////////// +namespace triton::backend::pytorch { extern "C" { @@ -1551,9 +214,13 @@ TRITONBACKEND_ModelInstanceExecute( // specific request. instance_state->ProcessRequests(requests, request_count); + if (model_state->EnabledCacheCleaning()) { + instance_state->ClearCache(); + } + return nullptr; // success -} +}; } // extern "C" -}}} // namespace triton::backend::pytorch +} // namespace triton::backend::pytorch diff --git a/src/libtorch.hh b/src/libtorch.hh new file mode 100644 index 0000000..4bd4700 --- /dev/null +++ b/src/libtorch.hh @@ -0,0 +1,59 @@ +// Copyright 2019-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions +// are met: +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// * Neither the name of NVIDIA CORPORATION nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +#include "model_instance_state.hh" +#include "model_state.hh" +#include "naming_convention.hh" +#include "string_utils.hh" + +// +// PyTorch C++ (LibTorch) Backend that implements the TRITONBACKEND API. 
+// + +namespace triton::backend::pytorch { + +extern "C" { + +TRITONSERVER_Error* TRITONBACKEND_Initialize(TRITONBACKEND_Backend* backend); + +TRITONSERVER_Error* TRITONBACKEND_ModelInitialize(TRITONBACKEND_Model* model); + +TRITONSERVER_Error* TRITONBACKEND_ModelFinalize(TRITONBACKEND_Model* model); + +TRITONSERVER_Error* TRITONBACKEND_ModelInstanceInitialize( + TRITONBACKEND_ModelInstance* instance); + +TRITONSERVER_Error* TRITONBACKEND_ModelInstanceFinalize( + TRITONBACKEND_ModelInstance* instance); + +TRITONSERVER_Error* TRITONBACKEND_ModelInstanceExecute( + TRITONBACKEND_ModelInstance* instance, TRITONBACKEND_Request** requests, + const uint32_t request_count); + +} // extern "C" + + +} // namespace triton::backend::pytorch diff --git a/src/libtorch_utils.cc b/src/libtorch_utils.cc index a554ba9..bd7353b 100644 --- a/src/libtorch_utils.cc +++ b/src/libtorch_utils.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2020-21 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// Copyright (c) 2020-24 NVIDIA CORPORATION & AFFILIATES. All rights reserved. // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions @@ -149,4 +149,31 @@ ParseParameter( return nullptr; } +TRITONSERVER_Error* +ParseParameter( + triton::common::TritonJson::Value& params, const std::string& mkey, + int* value) +{ + std::string value_str; + RETURN_IF_ERROR(GetParameterValue(params, mkey, &value_str)); + RETURN_IF_ERROR(ParseIntValue(value_str, value)); + + return nullptr; +} + + +#ifdef TRITON_ENABLE_GPU +TRITONSERVER_Error* +ConvertCUDAStatusToTritonError( + cudaError_t cuda_error, TRITONSERVER_Error_Code code, const char* msg) +{ + if (cuda_error != cudaSuccess) { + return TRITONSERVER_ErrorNew( + code, + (std::string(msg) + ": " + cudaGetErrorString(cuda_error)).c_str()); + } + return nullptr; // success +} +#endif + }}} // namespace triton::backend::pytorch diff --git a/src/libtorch_utils.h b/src/libtorch_utils.h index e112037..6ec325b 100644 --- a/src/libtorch_utils.h +++ b/src/libtorch_utils.h @@ -1,4 +1,4 @@ -// Copyright 2020-2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// Copyright 2020-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions @@ -35,7 +35,6 @@ #pragma warning(push, 0) #include #include -#include #include #include #include // One-stop header for TorchScript @@ -51,11 +50,23 @@ std::pair ConvertDataTypeToTorchType( std::pair ModelConfigDataTypeToTorchType( const std::string& data_type_str); -// If the key 'mkey' is present in 'params' then update 'value' with the value -// associated with that key. If 'mkey' is not present in 'params' then no update -// is made to 'value'. +#ifdef TRITON_ENABLE_GPU +TRITONSERVER_Error* ConvertCUDAStatusToTritonError( + cudaError_t cuda_error, TRITONSERVER_Error_Code code, const char* msg); +#endif + +// If the key 'mkey' is present in 'params' then update 'value' with the +// value associated with that key. If 'mkey' is not present in 'params' then +// no update is made to 'value'. TRITONSERVER_Error* ParseParameter( triton::common::TritonJson::Value& params, const std::string& mkey, bool* value); +// If the key 'mkey' is present in 'params' then update 'value' with the +// value associated with that key. If 'mkey' is not present in 'params' then +// 'value' is set to 'default_value'. 
+TRITONSERVER_Error* ParseParameter( + triton::common::TritonJson::Value& params, const std::string& mkey, + int* value); + }}} // namespace triton::backend::pytorch diff --git a/src/model.py b/src/model.py new file mode 100755 index 0000000..d8ed413 --- /dev/null +++ b/src/model.py @@ -0,0 +1,315 @@ +#!/usr/bin/env python3 + +# Copyright 2023-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
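# [Illustrative note; not part of the patch] Layout accepted by this
# Python-based runtime: either a TorchScript archive, or a Python file
# defining a torch.nn.Module subclass plus an optional state-dict file with
# the same stem, e.g.
#
#   my_model/
#     config.pbtxt
#     1/
#       model.py        # torch.nn.Module subclass, run under torch.compile
#       model.pt        # optional state_dict loaded via load_state_dict()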
+ +import importlib +import json +import os + +try: + import torch +except ModuleNotFoundError as error: + raise RuntimeError("Missing/Incomplete PyTorch package installation") from error + +import triton_python_backend_utils as pb_utils + + +def _get_model_path(config): + # FIXME: Add support for torch.export IR models (.pt2) + filenames = ["model.py", "model.pt"] + if config["default_model_filename"]: + filenames.insert(0, config["default_model_filename"]) + for filename in filenames: + model_path = os.path.join(pb_utils.get_model_dir(), filename) + if os.path.exists(model_path): + return model_path + raise pb_utils.TritonModelException( + "No model found in " + pb_utils.get_model_dir() + "/" + str(filenames) + ) + + +def _get_model_data_path(model_path): + data_path_extensions = [".pt"] + model_path_no_extension = model_path[: -(len(model_path.split(".")[-1]) + 1)] + for extension in data_path_extensions: + data_path = model_path_no_extension + extension + if os.path.exists(data_path): + return data_path + # data file not provided + return "" + + +def _is_py_class_model(model_path): + return model_path[-3:] == ".py" + + +def _import_module_from_path(module_name, file_path): + spec = importlib.util.spec_from_file_location(module_name, file_path) + module = importlib.util.module_from_spec(spec) + spec.loader.exec_module(module) + return module + + +def _get_model_class_from_module(module): + names = dir(module) + for name in names: + attr = getattr(module, name) + try: + if issubclass(attr, torch.nn.Module): + return attr + except TypeError: + # attr may not be a class + pass + raise pb_utils.TritonModelException("Cannot find a subclass of torch.nn.Module") + + +def _parse_io_config(io_config): + io = [] + for conf in io_config: + io.append({"name": conf["name"]}) + return io + + +def _get_device_name(kind, device_id): + if kind == "GPU": + return "cuda:" + device_id + if kind == "CPU": + return "cpu" + # unspecified device + return "" + + +def _get_device(kind, device_id, model): + device_name = _get_device_name(kind, device_id) + if device_name == "": + for param in model.parameters(): + return param.device + raise pb_utils.TritonModelException("Cannot determine model device") + return torch.device(device_name) + + +def _set_torch_parallelism(config): + log_msg = "" + parallelism_settings = ["NUM_THREADS", "NUM_INTEROP_THREADS"] + for setting in parallelism_settings: + val = "1" + if setting in config["parameters"]: + val = config["parameters"][setting]["string_value"] + getattr(torch, "set_" + setting.lower())(int(val)) + log_msg += setting + " = " + val + "; " + return log_msg + + +def _get_torch_compile_params(config): + params = {} + if "TORCH_COMPILE_OPTIONAL_PARAMETERS" in config["parameters"]: + val = config["parameters"]["TORCH_COMPILE_OPTIONAL_PARAMETERS"]["string_value"] + params = json.loads(val) + if "model" in params: + raise pb_utils.TritonModelException( + "'model' is not an optional parameter for 'torch.compile'" + ) + return params + + +def _gather_torch_tensors(scatter_tensors): + gather_tensors = [] + sections = [] + for i in range(len(scatter_tensors)): + tensors = scatter_tensors[i] + for j in range(len(tensors)): + tensor = tensors[j] + if j < len(gather_tensors): + # add to existing tensor + gather_tensors[j] = torch.cat((gather_tensors[j], tensor), 0) + else: + # start a new tensor + gather_tensors.append(tensor) + # record section + section_length = tensors[0].size()[0] + sections.append(section_length) + return gather_tensors, sections + + +def 
_scatter_torch_tensors(gather_tensors, sections): + scatter_tensors = [] + for j in range(len(gather_tensors)): + scatter_tensor = torch.split(gather_tensors[j], sections) + for i in range(len(scatter_tensor)): + tensor = scatter_tensor[i] + if i < len(scatter_tensors): + # add to existing response + scatter_tensors[i].append(tensor) + else: + # start a new response + scatter_tensors.append([tensor]) + return scatter_tensors + + +class TritonPythonModel: + """Your Python model must use the same class name. Every Python model + that is created must have "TritonPythonModel" as the class name. + """ + + def initialize(self, args): + """`initialize` is called only once when the model is being loaded. + Implementing `initialize` function is optional. This function allows + the model to initialize any state associated with this model. + Parameters + ---------- + args : dict + Both keys and values are strings. The dictionary keys and values are: + * model_config: A JSON string containing the model configuration + * model_instance_kind: A string containing model instance kind + * model_instance_device_id: A string containing model instance device ID + * model_repository: Model repository path + * model_version: Model version + * model_name: Model name + """ + self._model_name = args["model_name"] + for_model = "for '" + self._model_name + "'" + self._logger = pb_utils.Logger + self._logger.log_info("Initializing model instance " + for_model) + + self._model_config = json.loads(args["model_config"]) + self._kind = args["model_instance_kind"] + self._device_id = args["model_instance_device_id"] + self._support_batching = self._model_config["max_batch_size"] > 0 + self._inputs = _parse_io_config(self._model_config["input"]) + self._outputs = _parse_io_config(self._model_config["output"]) + + setting_msg = _set_torch_parallelism(self._model_config) + self._logger.log_verbose( + "Torch parallelism settings " + for_model + ": " + setting_msg + ) + + self._infer_mode = torch.inference_mode(mode=True) + self._infer_mode.__enter__() + + params = _get_torch_compile_params(self._model_config) + self._logger.log_verbose( + "'torch.compile' optional parameter(s) " + for_model + ": " + str(params) + ) + if self._support_batching: + self._gather = torch.compile(_gather_torch_tensors, **params) + self._scatter = torch.compile(_scatter_torch_tensors, **params) + + model_path = _get_model_path(self._model_config) + if not _is_py_class_model(model_path): + self._logger.log_info("Loading '" + self._model_name + "' as TorchScript") + self._model = torch.jit.load(model_path) + self._device = _get_device(self._kind, self._device_id, self._model) + self._model.to(self._device) + self._model.eval() + return + + self._model_module = _import_module_from_path(self._model_name, model_path) + self._model_class = _get_model_class_from_module(self._model_module) + self._raw_model = self._model_class() + self._device = _get_device(self._kind, self._device_id, self._raw_model) + data_path = _get_model_data_path(model_path) + if data_path != "": + self._raw_model.load_state_dict( + torch.load(data_path, map_location=self._device) + ) + else: + self._logger.log_info("Model parameter file not found " + for_model) + self._raw_model.to(self._device) + self._raw_model.eval() + self._model = torch.compile(self._raw_model, **params) + + def execute(self, requests): + """`execute` MUST be implemented in every Python model. `execute` + function receives a list of pb_utils.InferenceRequest as the only + argument. 
This function is called when an inference request is made + for this model. Depending on the batching configuration (e.g. Dynamic + Batching) used, `requests` may contain multiple requests. Every + Python model, must create one pb_utils.InferenceResponse for every + pb_utils.InferenceRequest in `requests`. If there is an error, you can + set the error argument when creating a pb_utils.InferenceResponse + Parameters + ---------- + requests : list + A list of pb_utils.InferenceRequest + Returns + ------- + list + A list of pb_utils.InferenceResponse. The length of this list must + be the same as `requests` + """ + + responses = [] + + requests_tensors = [] + for request in requests: + tensors = [] + for io in self._inputs: + tensor = pb_utils.get_input_tensor_by_name( + request, io["name"] + ).to_dlpack() + tensor = torch.from_dlpack(tensor).to(self._device) + tensors.append(tensor) + requests_tensors.append(tensors) + + sections = None + if self._support_batching: + requests_tensors, sections = self._gather(requests_tensors) + requests_tensors = [requests_tensors] + + responses_tensors = [] + for input_tensors in requests_tensors: + output_tensors = self._model(*input_tensors) + if not isinstance(output_tensors, tuple) and not isinstance( + output_tensors, list + ): + output_tensors = [output_tensors] + responses_tensors.append(output_tensors) + + if self._support_batching: + responses_tensors = self._scatter(responses_tensors[0], sections) + + for response_tensors in responses_tensors: + output_tensors = [] + for i in range(len(self._outputs)): + io = self._outputs[i] + tensor = response_tensors[i].detach() + tensor = pb_utils.Tensor.from_dlpack(io["name"], tensor) + output_tensors.append(tensor) + inference_response = pb_utils.InferenceResponse( + output_tensors=output_tensors + ) + responses.append(inference_response) + + return responses + + def finalize(self): + """`finalize` is called only once when the model is being unloaded. + Implementing `finalize` function is OPTIONAL. This function allows + the model to perform any necessary clean ups before exit. + """ + self._logger.log_info("Removing model instance for '" + self._model_name + "'") + self._infer_mode.__exit__(exc_type=None, exc_value=None, traceback=None) diff --git a/src/model_instance_state.cc b/src/model_instance_state.cc new file mode 100644 index 0000000..19cae27 --- /dev/null +++ b/src/model_instance_state.cc @@ -0,0 +1,1633 @@ +// Copyright 2019-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions +// are met: +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// * Neither the name of NVIDIA CORPORATION nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +#include "model_instance_state.hh" + +#include "string_utils.hh" + +#ifdef TRITON_PYTORCH_ENABLE_TORCHVISION +// Suppress warnings in torch headers +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wsign-compare" +#pragma warning(push, 0) +#include +#include // Torchvision header +#pragma warning(pop) +#pragma GCC diagnostic pop +#endif // TRITON_PYTORCH_ENABLE_TORCHVISION + +#ifdef TRITON_ENABLE_GPU +#include +#include +#include +#endif // TRITON_ENABLE_GPU + + +namespace triton::backend::pytorch { + +ModelInstanceState::ModelInstanceState( + ModelState* model_state, TRITONBACKEND_ModelInstance* triton_model_instance) + : BackendModelInstance(model_state, triton_model_instance), + model_state_(model_state), device_(torch::kCPU), is_dict_input_(false), + device_cnt_(0) +{ + if (Kind() == TRITONSERVER_INSTANCEGROUPKIND_GPU) { +#ifdef TRITON_ENABLE_GPU + device_ = torch::Device(torch::kCUDA, DeviceId()); + CreateCudaEvents(DeviceId()); +#endif + } + +#ifdef TRITON_ENABLE_GPU + device_cnt_ = torch::cuda::device_count(); +#endif + + THROW_IF_BACKEND_INSTANCE_ERROR(model_state->LoadModel( + ArtifactFilename(), device_, &model_path_, Kind(), &torch_model_)); + + if (Kind() == TRITONSERVER_INSTANCEGROUPKIND_MODEL) { +#ifdef TRITON_ENABLE_GPU + // Since we cannot determine the exact devices used by the model, we create + // a CUDA stream for every available device to ensure proper synchronization + // of CUDA streams. This approach may have implications when a timestamp is + // captured on a device that is not used by the model. Currently, this issue + // is addressed by synchronizing the CUDA streams before recording + // timestamps to prevent timestamp skewing. However, in the future, any + // modifications to the CUDA stream synchronization logic should be handled + // with caution. + for (int i = 0; i < device_cnt_; i++) { + cudaStream_t stream; + THROW_IF_BACKEND_INSTANCE_ERROR( + CreateCudaStream(i, 0 /* cuda_stream_priority */, &stream)); + stream_vec_.push_back(stream); + } + if (!stream_vec_.empty()) { + // Create CUDA events on the first device that will be used for collecting + // inputs/outputs. + CreateCudaEvents(0); + } +#endif + } + + size_t expected_input_cnt = 0; + { + triton::common::TritonJson::Value inputs; + if (model_state->ModelConfig().Find("input", &inputs)) { + expected_input_cnt = inputs.ArraySize(); + } + + triton::common::TritonJson::Value config_batch_inputs; + if (model_state->ModelConfig().Find("batch_input", &config_batch_inputs)) { + batch_input_count_ = config_batch_inputs.ArraySize(); + expected_input_cnt += batch_input_count_; + } + } + + // If this is a sequence model then make sure that the required + // inputs are present in the model and have the correct shape and + // datatype. 
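// [Illustrative note; not part of the patch] A worked example of the input
// counting above and in the sequence_batching block that follows: a
// configuration with 2 regular inputs, 1 batch_input, START and READY
// controls, and 1 state yields expected_input_cnt = 2 + 1 + 2 + 1 = 6, which
// ValidateInputs() then checks against the model's forward() signature.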
+ triton::common::TritonJson::Value sequence_batching; + if (model_state->ModelConfig().Find( + "sequence_batching", &sequence_batching)) { + bool have_start, have_end, have_ready, have_corrid; + THROW_IF_BACKEND_INSTANCE_ERROR(ValidateBooleanSequenceControl( + sequence_batching, "CONTROL_SEQUENCE_START", false /* required */, + &have_start)); + THROW_IF_BACKEND_INSTANCE_ERROR(ValidateBooleanSequenceControl( + sequence_batching, "CONTROL_SEQUENCE_END", false /* required */, + &have_end)); + THROW_IF_BACKEND_INSTANCE_ERROR(ValidateBooleanSequenceControl( + sequence_batching, "CONTROL_SEQUENCE_READY", false /* required */, + &have_ready)); + THROW_IF_BACKEND_INSTANCE_ERROR(ValidateTypedSequenceControl( + sequence_batching, "CONTROL_SEQUENCE_CORRID", false /* required */, + &have_corrid)); + if (have_start) { + expected_input_cnt += 1; + } + if (have_end) { + expected_input_cnt += 1; + } + if (have_ready) { + expected_input_cnt += 1; + } + if (have_corrid) { + expected_input_cnt += 1; + } + // Add the state inputs to the expected count + triton::common::TritonJson::Value states; + if (sequence_batching.Find("state", &states)) { + expected_input_cnt += states.ArraySize(); + } + } + supports_batching_ = model_state_->MaxBatchSize() > 0; + + THROW_IF_BACKEND_INSTANCE_ERROR(ValidateInputs(expected_input_cnt)); + THROW_IF_BACKEND_INSTANCE_ERROR(ValidateOutputs()); +} + +ModelInstanceState::~ModelInstanceState() +{ + torch_model_.reset(); + ClearCache(); + + if (Kind() == TRITONSERVER_INSTANCEGROUPKIND_MODEL) { +#ifdef TRITON_ENABLE_GPU + for (size_t i = 0; i < stream_vec_.size(); i++) { + LOG_IF_ERROR( + ConvertCUDAStatusToTritonError( + cudaSetDevice(i), TRITONSERVER_ERROR_INTERNAL, + "Failed to set the device"), + "Failed to set the device"); + + LOG_IF_ERROR( + ConvertCUDAStatusToTritonError( + cudaStreamDestroy(stream_vec_[i]), TRITONSERVER_ERROR_INTERNAL, + "Failed to destroy cuda stream"), + "~ModelInstanceState error: "); + stream_vec_[i] = nullptr; + } +#endif + } +} + +void +ModelInstanceState::AddInputToMap( + NamingConvention naming_convention, + const std::vector allowed_inputs, const std::string& io_name, + const uint32_t index) +{ + std::string deliminator = "__"; + + if (is_dict_input_) { + // If dictionary, index is irrelevant but we use the map to store the + // input names since they are the keys for the dictionary + input_index_map_[io_name] = index; + } else { + switch (naming_convention) { + case NamingConvention::FORWARD_ARGUMENT: { + auto itr = + std::find(allowed_inputs.begin(), allowed_inputs.end(), io_name); + if (itr != allowed_inputs.end()) { + input_index_map_[io_name] = + std::distance(allowed_inputs.begin(), itr); + } + return; + } + case NamingConvention::NAMED_INDEX: { + int start_pos = io_name.find(deliminator); + int ip_index = std::atoi(io_name.substr(start_pos + 2).c_str()); + input_index_map_[io_name] = ip_index; + return; + } + case NamingConvention::STRICT_CONFIG_ORDERING: { + input_index_map_[io_name] = index; + return; + } + } + } +} + +void +ModelInstanceState::ClearCache() +{ +#ifdef TRITON_ENABLE_GPU + if (device_.is_cuda() || + ((Kind() == TRITONSERVER_INSTANCEGROUPKIND_MODEL) && (device_cnt_ > 0))) { + c10::cuda::CUDACachingAllocator::emptyCache(); + } +#endif // TRITON_ENABLE_GPU +} + +TRITONSERVER_Error* +ModelInstanceState::Create( + ModelState* model_state, TRITONBACKEND_ModelInstance* triton_model_instance, + ModelInstanceState** state) +{ + try { + *state = new ModelInstanceState(model_state, triton_model_instance); + } + catch (const 
BackendModelInstanceException& ex) { + RETURN_ERROR_IF_TRUE( + ex.err_ == nullptr, TRITONSERVER_ERROR_INTERNAL, + std::string("unexpected nullptr in BackendModelInstanceException")); + RETURN_IF_ERROR(ex.err_); + } + + return nullptr; // success +} + +void +ModelInstanceState::Execute( + std::vector* responses, + const uint32_t response_count, + std::vector* input_tensors, + std::vector* output_tensors) +{ + NVTX_RANGE(nvtx_, "Execute " + Name()); + + torch::jit::IValue model_outputs_; + + try { + // enable/disable optimized execution + torch::jit::setGraphExecutorOptimize( + model_state_->EnabledOptimizedExecution()); + + // enable/disable inference mode - supersedes NoGradGuard + torch::InferenceMode infer_guard(model_state_->EnabledInferenceMode()); + + // enable/disable cudnn + at::globalContext().setUserEnabledCuDNN(model_state_->EnabledCudnn()); + + // JIT. No change is made unless parameter is explicitly set. + if (std::get<0>(model_state_->EnabledJitProfiling())) { + torch::jit::getProfilingMode() = + std::get<1>(model_state_->EnabledJitProfiling()); + } + + if (std::get<0>(model_state_->EnabledJitExecutor())) { + torch::jit::getExecutorMode() = + std::get<1>(model_state_->EnabledJitExecutor()); + } + + // Fuser. No change is made unless fuser is explicitly set in + // parameters. + if (std::get<0>(model_state_->EnabledTensorExprFuser())) { + torch::jit::setTensorExprFuserEnabled( + std::get<1>(model_state_->EnabledTensorExprFuser())); + } + + torch::NoGradGuard no_grad; + + // If input is a dictionary, prepare dictionary from 'input_tensors'. + if (is_dict_input_) { + torch::Dict input_dict; + for (auto& input_index : input_index_map_) { + torch::jit::IValue ival = (*input_tensors)[input_index.second]; + input_dict.insert(input_index.first, ival.toTensor()); + } + std::vector input_dict_ivalue = {input_dict}; + model_outputs_ = torch_model_->forward(input_dict_ivalue); + } else { + model_outputs_ = torch_model_->forward(*input_tensors); + } + + if (model_outputs_.isTuple()) { + auto model_outputs_tuple = model_outputs_.toTuple(); + size_t op_index = 0; + for (auto& m_op : model_outputs_tuple->elements()) { + if (m_op.isList()) { + auto list_output = m_op.toList(); + if (list_output.elementType()->kind() != c10::TypeKind::StringType) { + throw std::invalid_argument( + "output at index " + std::to_string(op_index) + + " must be of type Tensor or List[str], received List[" + + list_output.elementType()->str() + "]"); + } + output_tensors->push_back(m_op); + } else { + auto tensor_output = m_op.toTensor(); + output_tensors->push_back(m_op); + } + op_index++; + } + } else if (model_outputs_.isTensor()) { + output_tensors->push_back(model_outputs_); + } else if (model_outputs_.isList()) { + auto list_output = model_outputs_.toList(); + if (list_output.elementType()->kind() != c10::TypeKind::StringType) { + throw std::invalid_argument( + "output must be of type Tensor or List[str], received List[" + + list_output.elementType()->str() + "]"); + } + output_tensors->push_back(model_outputs_); + } else { + throw std::invalid_argument( + "output must be of type Tensor, List[str] or Tuple containing one of " + "these two types. 
It should not be a List / Dictionary of Tensors or " + "a Scalar"); + } + } + catch (std::exception& ex) { + SendErrorForResponses( + responses, response_count, + TRITONSERVER_ErrorNew( + TRITONSERVER_ERROR_INTERNAL, + ("PyTorch execute failure: " + std::string(ex.what())).c_str())); + } +} + +float +ModelInstanceState::GetCudaEventElapsedTime( + const cudaEvent_t& start_event, const cudaEvent_t& end_event) +{ + float duration = 0; +#ifdef TRITON_ENABLE_GPU + // [FIXME] in the case of cudaEventElapsedTime failure, should handle + // stats reporting more gracefully as the durations are inaccurate + LOG_IF_ERROR( + ConvertCUDAStatusToTritonError( + cudaEventElapsedTime(&duration, start_event, end_event), + TRITONSERVER_ERROR_INTERNAL, "Failed to capture elapsed time"), + "Failed to capture elapsed time"); +#endif + return duration; +} + + +cudaStream_t +ModelInstanceState::GetCudaStreamByInstanceKind() +{ +#ifdef TRITON_ENABLE_GPU + if (Kind() == TRITONSERVER_INSTANCEGROUPKIND_GPU) { + return stream_; + } else if ( + (Kind() == TRITONSERVER_INSTANCEGROUPKIND_MODEL) && + !stream_vec_.empty()) { + return stream_vec_[0]; + } +#endif + return nullptr; +} + +TRITONSERVER_Error* +ModelInstanceState::GetNamingConvention( + NamingConvention* naming_convention, + const std::vector& allowed_ios) +{ + // Rules for (non-Dictionary) input tensor names: + // 1. Must be in 'allowed_inputs' (arguments in the forward function) + // 2. Must follow the naming convention i.e. __ + // 3. If neither of the above conditions are satisfied, enforce strict + // ordering of model inputs. + // + // Rules for output tensor names: + // 1. Must follow the naming convention i.e. __ + // 2. If not, we enforce strict ordering of model outputs. + std::string deliminator = "__"; + std::string io_kind = "input"; + *naming_convention = NamingConvention::FORWARD_ARGUMENT; + + // symbolizes output + if (allowed_ios.size() == 0) { + io_kind = "output"; + *naming_convention = NamingConvention::NAMED_INDEX; + } + + triton::common::TritonJson::Value ios; + RETURN_IF_ERROR( + model_state_->ModelConfig().MemberAsArray(io_kind.c_str(), &ios)); + + if (io_kind == "input") { + for (size_t i = 0; i < ios.ArraySize(); i++) { + triton::common::TritonJson::Value io; + RETURN_IF_ERROR(ios.IndexAsObject(i, &io)); + + // Validate name + std::string io_name; + RETURN_IF_ERROR(io.MemberAsString("name", &io_name)); + auto itr = std::find(allowed_ios.begin(), allowed_ios.end(), io_name); + if (itr == allowed_ios.end()) { + *naming_convention = NamingConvention::NAMED_INDEX; + break; + } + } + } + + // If not, check if inputs follow INDEX + if (*naming_convention == NamingConvention::NAMED_INDEX) { + for (size_t i = 0; i < ios.ArraySize(); i++) { + triton::common::TritonJson::Value io; + RETURN_IF_ERROR(ios.IndexAsObject(i, &io)); + + // Validate name + std::string io_name; + RETURN_IF_ERROR(io.MemberAsString("name", &io_name)); + int start_pos = io_name.find(deliminator); + if (start_pos == -1) { + *naming_convention = NamingConvention::STRICT_CONFIG_ORDERING; + break; + } else { + // check if the index part of the name is not an integer + std::string index_str = io_name.substr(start_pos + 2); + bool is_int = true; + for (auto itr = index_str.begin(); itr != index_str.end(); itr++) { + if (std::isdigit(*itr) == 0) { + is_int = false; + } + } + + if (!is_int) { + if (io_kind == "input") { + LOG_MESSAGE( + TRITONSERVER_LOG_WARN, + ("input '" + io_name + + "' or previous input(s) are neither an input argument to the " + "model '" + + 
model_state_->Name() + + "' nor do they follow the __ naming convention. " + "Falling back to enforcing strict ordering from model " + "configuration.") + .c_str()); + } else { + LOG_MESSAGE( + TRITONSERVER_LOG_WARN, + ("output '" + io_name + + "' or previous output(s) of the model '" + + model_state_->Name() + + "' do not follow the __ naming convention. " + "Falling back to enforcing strict ordering from model " + "configuration.") + .c_str()); + } + *naming_convention = NamingConvention::STRICT_CONFIG_ORDERING; + break; + } + } + } + } + + triton::common::TritonJson::Value sequence_batching; + if (model_state_->ModelConfig().Find( + "sequence_batching", &sequence_batching)) { + // If we need to manage state for the model, then we need to check + // the naming of the state adheres to both the input and output conventions + triton::common::TritonJson::Value states; + if (sequence_batching.Find("state", &states)) { + if (*naming_convention != NamingConvention::NAMED_INDEX) { + return TRITONSERVER_ErrorNew( + TRITONSERVER_ERROR_INVALID_ARG, + ("PyTorch model '" + model_state_->Name() + + "' is using sequence batching with state but not all inputs and " + "outputs follow the __ naming convention. ") + .c_str()); + } + } + + for (size_t i = 0; i < states.ArraySize(); i++) { + triton::common::TritonJson::Value state; + RETURN_IF_ERROR(states.IndexAsObject(i, &state)); + std::string name_entry = + io_kind == "input" ? "input_name" : "output_name"; + std::string state_name; + RETURN_IF_ERROR(state.MemberAsString(name_entry.c_str(), &state_name)); + int start_pos = state_name.find(deliminator); + if (start_pos == -1) { + return TRITONSERVER_ErrorNew( + TRITONSERVER_ERROR_INVALID_ARG, + ("PyTorch model '" + model_state_->Name() + + "' is using sequence batching with state but state '" + + state_name + + "' does not follow the __ naming convention. ") + .c_str()); + } else { + // check if the index part of the name is not an integer + std::string index_str = state_name.substr(start_pos + 2); + bool is_int = true; + for (auto itr = index_str.begin(); itr != index_str.end(); itr++) { + if (std::isdigit(*itr) == 0) { + is_int = false; + } + } + if (!is_int) { + return TRITONSERVER_ErrorNew( + TRITONSERVER_ERROR_INVALID_ARG, + ("PyTorch model '" + model_state_->Name() + + "' is using sequence batching with state but state '" + + state_name + + "' does not follow the __ naming convention. ") + .c_str()); + } + } + } + } + + return nullptr; // success +} + +void +ModelInstanceState::ProcessRequests( + TRITONBACKEND_Request** requests, const uint32_t request_count) +{ + LOG_MESSAGE( + TRITONSERVER_LOG_VERBOSE, + (std::string("TRITONBACKEND_ModelExecute: Running ") + Name() + " with " + + std::to_string(request_count) + " requests") + .c_str()); + +#ifdef TRITON_ENABLE_GPU + if (Kind() == TRITONSERVER_INSTANCEGROUPKIND_GPU) { + SetCurrentCudaStream(stream_, DeviceId()); + } else if (Kind() == TRITONSERVER_INSTANCEGROUPKIND_MODEL) { + // Replace the default stream of each device with the one we created. + for (size_t i = 0; i < stream_vec_.size(); i++) { + SetCurrentCudaStream(stream_vec_[i], i); + } + } +#endif + + NVTX_RANGE(nvtx_, "ProcessRequests " + Name()); + + uint64_t exec_start_ns = 0; + SET_TIMESTAMP(exec_start_ns); + + const int max_batch_size = model_state_->MaxBatchSize(); + + // For each request collect the total batch size for this inference + // execution. The batch-size, number of inputs, and size of each + // input has already been checked so don't need to do that here. 
+ size_t total_batch_size = 0; + for (size_t i = 0; i < request_count; i++) { + // If we get a nullptr request then something is badly wrong. Fail + // and release all requests. + if (requests[i] == nullptr) { + RequestsRespondWithError( + requests, request_count, + TRITONSERVER_ErrorNew( + TRITONSERVER_ERROR_INTERNAL, + std::string( + "null request given to PyTorch backend for '" + Name() + "'") + .c_str())); + return; + } + } + + // At this point we are committed to running inference with all + // 'requests'. Create a response for each request. During input + // processing if there is an error with any request that error will + // be sent immediately with the corresponding response (and the + // response unique_ptr will then be nullptr). The request object + // itself will not be released until after all inferencing is done + // (below) as we may need to access the request object when + // determine how to process outputs (for example, even if we don't + // need the outputs for a request that has an error, we do need to + // know the size of those outputs associated with the request so we + // can skip them in the output tensors). + std::vector responses; + responses.reserve(request_count); + bool all_response_failed = false; + + for (size_t i = 0; i < request_count; i++) { + TRITONBACKEND_Response* response; + auto err = TRITONBACKEND_ResponseNew(&response, requests[i]); + if (err == nullptr) { + responses.emplace_back(response); + } else { + responses.emplace_back(nullptr); + LOG_MESSAGE(TRITONSERVER_LOG_ERROR, "Fail to create response"); + TRITONSERVER_ErrorDelete(err); + } + } + + for (size_t i = 0; i < request_count; i++) { + if (max_batch_size > 0) { + // Retrieve the batch size from one of the inputs, if the model + // supports batching, the first dimension size is batch size. + TRITONBACKEND_Input* input; + TRITONSERVER_Error* err = + TRITONBACKEND_RequestInputByIndex(requests[i], 0 /* index */, &input); + if (err == nullptr) { + const int64_t* shape; + err = TRITONBACKEND_InputProperties( + input, nullptr, nullptr, &shape, nullptr, nullptr, nullptr); + total_batch_size += shape[0]; + } + if (err != nullptr) { + RESPOND_ALL_AND_SET_TRUE_IF_ERROR( + responses, request_count, all_response_failed, err); + } + } else { + total_batch_size += 1; + } + } + + // If there are no valid payloads then no need to run the inference. + if (total_batch_size == 0) { + return; + } + + // Make sure the maximum batch size is not exceeded. The + // total_batch_size must be 1 for models that don't support batching + // (i.e. max_batch_size == 0). If max_batch_size is exceeded then + // scheduler has done something badly wrong so fail and release all + // requests. + if (!all_response_failed) { + if ((total_batch_size != 1) && + (total_batch_size > (size_t)max_batch_size)) { + RESPOND_ALL_AND_SET_TRUE_IF_ERROR( + responses, request_count, all_response_failed, + TRITONSERVER_ErrorNew( + TRITONSERVER_ERROR_INTERNAL, + std::string( + "batch size " + std::to_string(total_batch_size) + " for '" + + Name() + "', max allowed is " + + std::to_string(max_batch_size)) + .c_str())); + } + } + + std::vector input_names; + std::vector input_tensors; + bool cuda_copy = false; + std::unique_ptr collector; + + // For 'KIND_MODEL', it's fine to use CUDA events to calculate the compute + // input duration since only one stream will be used for input collection. 
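(Aside, not part of the patch: the compute-duration accounting used here relies on the standard CUDA event pattern — record an event on the stream before and after the work, then convert the pair into a duration with cudaEventElapsedTime. A minimal, self-contained sketch of that pattern follows; the function name and work placeholder are illustrative, not taken from this backend.)

// Illustrative sketch: measuring elapsed time on a CUDA stream with events.
#include <cuda_runtime_api.h>

float TimeSectionMs(cudaStream_t stream)
{
  cudaEvent_t start, stop;
  cudaEventCreate(&start);
  cudaEventCreate(&stop);

  cudaEventRecord(start, stream);  // mark the point where the work begins
  // ... enqueue GPU work on 'stream' here ...
  cudaEventRecord(stop, stream);   // mark the point where the work ends

  cudaEventSynchronize(stop);      // wait until 'stop' has actually been reached
  float ms = 0.0f;                 // cudaEventElapsedTime reports milliseconds
  cudaEventElapsedTime(&ms, start, stop);

  cudaEventDestroy(start);
  cudaEventDestroy(stop);
  return ms;
}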
+ if ((Kind() == TRITONSERVER_INSTANCEGROUPKIND_GPU) || + ((Kind() == TRITONSERVER_INSTANCEGROUPKIND_MODEL) && (device_cnt_ > 0))) { +#ifdef TRITON_ENABLE_GPU + RESPOND_ALL_AND_SET_TRUE_IF_ERROR( + responses, request_count, all_response_failed, + ConvertCUDAStatusToTritonError( + cudaEventRecord( + compute_input_start_event_, GetCudaStreamByInstanceKind()), + TRITONSERVER_ERROR_INTERNAL, "Failed to record the event.")); +#endif + } + + if (!all_response_failed) { + collector.reset(new BackendInputCollector( + requests, request_count, &responses, + model_state_->TritonMemoryManager(), model_state_->EnablePinnedInput(), + GetCudaStreamByInstanceKind(), nullptr, nullptr, 0, + HostPolicyName().c_str())); + RESPOND_ALL_AND_SET_TRUE_IF_ERROR( + responses, request_count, all_response_failed, + SetInputTensors( + total_batch_size, requests, request_count, &responses, + collector.get(), &input_names, &input_tensors, &cuda_copy)); + } + +#ifdef TRITON_ENABLE_GPU + if (cuda_copy) { + cudaStreamSynchronize(GetCudaStreamByInstanceKind()); + cuda_copy = false; + } +#endif + + std::vector output_tensors; + uint64_t compute_start_ns = 0; + uint64_t compute_infer_start = 0; + + RESPOND_ALL_AND_SET_TRUE_IF_ERROR( + responses, request_count, all_response_failed, + RecordBackendTimestamp( + &compute_start_ns, + reinterpret_cast(&compute_infer_start_event_))); + + // For 'KIND_MODEL', capture the timestamp for the compute infer duration. + if ((Kind() == TRITONSERVER_INSTANCEGROUPKIND_MODEL) && (device_cnt_ > 0)) { + SET_TIMESTAMP(compute_infer_start); + } + + // Run... + if (!all_response_failed) { + Execute(&responses, request_count, &input_tensors, &output_tensors); + } + + // Verify output indices are valid with number of outputs after execution + bool invalid_index = false; + int max_index = output_tensors.size() - 1; + + if (!all_response_failed) { + for (const auto& name : model_state_->ModelOutputs()) { + int op_index = output_index_map_[name.first]; + if ((op_index < 0) || (op_index > max_index)) { + RESPOND_ALL_AND_SET_TRUE_IF_ERROR( + responses, request_count, all_response_failed, + TRITONSERVER_ErrorNew( + TRITONSERVER_ERROR_INVALID_ARG, + std::string( + "The output " + std::string(name.first) + + " in the model configuration refers to an output index " + "which doesn't exist. This model has " + + std::to_string(max_index + 1) + " outputs") + .c_str())); + invalid_index = true; + break; + } + } + } + +#ifdef TRITON_ENABLE_GPU + if (Kind() == TRITONSERVER_INSTANCEGROUPKIND_MODEL) { + // For 'KIND_MODEL', multiple streams will be involved, so we need to call + // 'cudaStreamSynchronize' before reading the output tensors. 
+ for (auto& stream : stream_vec_) { + cudaStreamSynchronize(stream); + } + } +#endif + + uint64_t compute_end_ns = 0; + uint64_t compute_output_start = 0; + + if ((Kind() == TRITONSERVER_INSTANCEGROUPKIND_MODEL) && (device_cnt_ > 0)) { +#ifdef TRITON_ENABLE_GPU + SET_TIMESTAMP(compute_output_start); +#endif + } else { + RESPOND_ALL_AND_SET_TRUE_IF_ERROR( + responses, request_count, all_response_failed, + RecordBackendTimestamp( + &compute_end_ns, + reinterpret_cast(&compute_output_start_event_))); + } + + if (!all_response_failed) { + if (!invalid_index) { + RESPOND_ALL_AND_SET_TRUE_IF_ERROR( + responses, request_count, all_response_failed, + ReadOutputTensors( + total_batch_size, output_tensors, requests, request_count, + &responses)); + } + } + + uint64_t exec_end_ns = 0; + SET_TIMESTAMP(exec_end_ns); + + // Send all the responses that haven't already been sent because of + // an earlier error. Note that the responses are not set to nullptr + // here as we need that indication below to determine if the request + // we successful or not. + for (auto& response : responses) { + if (response != nullptr) { + LOG_IF_ERROR( + TRITONBACKEND_ResponseSend( + response, TRITONSERVER_RESPONSE_COMPLETE_FINAL, nullptr), + "failed to send PyTorch backend response"); + } + } + + // We don't need an explicit CUDA syncrhonization here since we have already + // synchronized the stream in the ReadOutputTensors function. + if (Kind() == TRITONSERVER_INSTANCEGROUPKIND_GPU) { +#ifdef TRITON_ENABLE_GPU + float compute_input_duration = GetCudaEventElapsedTime( + compute_input_start_event_, compute_infer_start_event_); + float compute_infer_duration = GetCudaEventElapsedTime( + compute_infer_start_event_, compute_output_start_event_); + + compute_start_ns = exec_start_ns + (compute_input_duration * 1e6); + compute_end_ns = compute_start_ns + (compute_infer_duration * 1e6); +#endif + } else if ( + (Kind() == TRITONSERVER_INSTANCEGROUPKIND_MODEL) && (device_cnt_ > 0)) { +#ifdef TRITON_ENABLE_GPU + float compute_input_duration = GetCudaEventElapsedTime( + compute_input_start_event_, compute_infer_start_event_); + uint64_t compute_infer_duration = + compute_output_start - compute_infer_start; + + compute_start_ns = exec_start_ns + (compute_input_duration * 1e6); + compute_end_ns = compute_start_ns + compute_infer_duration; +#endif + } + + // Report statistics for each request. + for (uint32_t r = 0; r < request_count; ++r) { + auto& request = requests[r]; + LOG_IF_ERROR( + TRITONBACKEND_ModelInstanceReportStatistics( + TritonModelInstance(), request, + (responses[r] != nullptr) /* success */, exec_start_ns, + compute_start_ns, compute_end_ns, exec_end_ns), + "failed reporting request statistics"); + + LOG_IF_ERROR( + TRITONBACKEND_RequestRelease(request, TRITONSERVER_REQUEST_RELEASE_ALL), + "failed releasing request"); + } + + if (!all_response_failed) { + // Report the entire batch statistics. 
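(Aside on the duration arithmetic a few lines above, not part of the patch: cudaEventElapsedTime returns milliseconds, so multiplying by 1e6 converts the value to the nanosecond scale Triton's statistics API expects. With made-up numbers, if exec_start_ns is 1,000,000,000 and the measured compute-input duration is 2.5 ms, then compute_start_ns = 1,000,000,000 + 2.5 * 1e6 = 1,002,500,000 ns; a 10 ms infer duration in the KIND_GPU branch then puts compute_end_ns at 1,012,500,000 ns. In the KIND_MODEL branch the infer duration is already a difference of CPU timestamps in nanoseconds, so it is added without conversion.)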
+    LOG_IF_ERROR(
+        TRITONBACKEND_ModelInstanceReportBatchStatistics(
+            TritonModelInstance(), total_batch_size, exec_start_ns,
+            compute_start_ns, compute_end_ns, exec_end_ns),
+        "failed reporting batch request statistics");
+  }
+}
+
+TRITONSERVER_Error*
+ModelInstanceState::ReadOutputTensors(
+    size_t total_batch_size,
+    const std::vector<torch::jit::IValue>& output_tensors,
+    TRITONBACKEND_Request** requests, const uint32_t request_count,
+    std::vector<TRITONBACKEND_Response*>* responses)
+{
+  NVTX_RANGE(nvtx_, "ReadOutputTensors " + Name());
+
+  BackendOutputResponder responder(
+      requests, request_count, responses, model_state_->TritonMemoryManager(),
+      model_state_->MaxBatchSize() > 0, model_state_->EnablePinnedInput(),
+      GetCudaStreamByInstanceKind());
+
+  bool cuda_copy = false;
+  // The serialized string buffer must be valid until output copies are done
+  std::vector<std::unique_ptr<std::string>> string_buffer;
+  for (auto& output : model_state_->ModelOutputs()) {
+    int op_index = output_index_map_[output.first];
+    auto name = output.first;
+    auto output_tensor_pair = output.second;
+
+    if (output_tensors[op_index].isTensor()) {
+      torch::Tensor output_flat;
+      try {
+        output_flat =
+            output_tensors[op_index].toTensor().contiguous().flatten();
+      }
+      catch (std::exception& ex) {
+        RETURN_IF_ERROR(TRITONSERVER_ErrorNew(
+            TRITONSERVER_ERROR_INTERNAL,
+            (std::string("output tensor '") + name + "' is not found")
+                .c_str()));
+      }
+
+      // Verify output datatype matches datatype from model config
+      TRITONSERVER_DataType output_dtype =
+          ConvertTorchTypeToDataType(output_flat.scalar_type());
+      TRITONSERVER_DataType config_datatype = output_dtype_map_[name];
+      if (config_datatype != output_dtype) {
+        RETURN_IF_ERROR(TRITONSERVER_ErrorNew(
+            TRITONSERVER_ERROR_INVALID_ARG,
+            (std::string("configuration expects datatype TYPE_") +
+             TRITONSERVER_DataTypeString(config_datatype) + " for output '" +
+             name + "', model provides TYPE_" +
+             TRITONSERVER_DataTypeString(output_dtype))
+                .c_str()));
+      }
+
+      const char* output_buffer =
+          static_cast<const char*>(output_flat.data_ptr());
+
+      // Output tensors may not reside on the same device as model
+      torch::Device tensor_device = output_flat.device();
+      const auto memory_type = (tensor_device.type() == torch::kCPU)
+                                   ? TRITONSERVER_MEMORY_CPU
+                                   : TRITONSERVER_MEMORY_GPU;
+      const auto memory_id =
+          (tensor_device.type() == torch::kCPU) ?
0 : tensor_device.index(); + + // Batch output doesn't support string data type yet, as it is not trivial + // to parse string output + const BatchOutput* batch_output = StateForModel()->FindBatchOutput(name); + if (batch_output == nullptr) { + // Get output shape + std::vector batchn_shape; + auto shape = output_tensors[op_index].toTensor().sizes(); + for (auto itr = shape.begin(); itr != shape.end(); itr++) { + batchn_shape.push_back(*itr); + } + + if (batchn_shape.size() == 0) { + return TRITONSERVER_ErrorNew( + TRITONSERVER_ERROR_INVALID_ARG, + (std::string("output '") + name + + "' is a scalar which is not supported.") + .c_str()); + } + if (output_tensor_pair.first != -1) { + responder.ProcessTensor( + name, output_dtype, batchn_shape, output_buffer, memory_type, + memory_id); + } + if (output_tensor_pair.second != -1) { + std::vector states; + states = responder.ProcessStateTensor( + name, output_dtype, batchn_shape, output_buffer, memory_type, + memory_id); + // Update the states + for (auto& state : states) { + RETURN_IF_ERROR(TRITONBACKEND_StateUpdate(state)); + } + } + + } else { + responder.ProcessBatchOutput( + name, *batch_output, output_buffer, memory_type, memory_id); + } + } else if (output_tensors[op_index].isList()) { + // Custom handling for string/bytes tensor... + torch::List output_list = + output_tensors[op_index].toList(); + + // Get output shape + std::vector batchn_shape{(int64_t)output_list.size()}; + + for (size_t idx = 0; idx < responses->size(); idx++) { + auto& request = requests[idx]; + auto& response = (*responses)[idx]; + + if (supports_batching_ != 0) { + TRITONBACKEND_Input* input; + TRITONBACKEND_RequestInputByIndex(request, 0 /* index*/, &input); + const int64_t* shape; + TRITONBACKEND_InputProperties( + input, nullptr, nullptr, &shape, nullptr, nullptr, nullptr); + batchn_shape[0] = shape[0]; + } + + int64_t tensor_element_cnt = 0; + RETURN_IF_ERROR(GetElementCount(batchn_shape, &tensor_element_cnt)); + + // Only need an response tensor for requested outputs. + if (response != nullptr) { + if (output_tensor_pair.first != -1) { + TRITONBACKEND_Output* response_output; + RESPOND_AND_SET_NULL_IF_ERROR( + &response, TRITONBACKEND_ResponseOutput( + response, &response_output, name.c_str(), + TRITONSERVER_TYPE_BYTES, batchn_shape.data(), + batchn_shape.size())); + string_buffer.emplace_back(new std::string()); + cuda_copy |= SetStringOutputBuffer( + &output_list, &response, response_output, tensor_element_cnt, + GetCudaStreamByInstanceKind(), string_buffer.back().get()); + } + } + if (output_tensor_pair.second != -1) { + TRITONBACKEND_State* response_state; + RESPOND_AND_SET_NULL_IF_ERROR( + &response, TRITONBACKEND_StateNew( + &response_state, request, name.c_str(), + TRITONSERVER_TYPE_BYTES, batchn_shape.data(), + batchn_shape.size())); + + string_buffer.emplace_back(new std::string()); + cuda_copy |= SetStringStateBuffer( + &output_list, &response, response_state, tensor_element_cnt, + GetCudaStreamByInstanceKind(), string_buffer.back().get()); + } + } + } else { + return TRITONSERVER_ErrorNew( + TRITONSERVER_ERROR_INVALID_ARG, + (std::string("output '") + name + + "' must be of type Tensor or List[str].") + .c_str()); + } + } + + // Finalize and wait for any pending buffer copies. + cuda_copy |= responder.Finalize(); + +#ifdef TRITON_ENABLE_GPU + // We have to always synchronize the stream. This is to make sure that + // the events on the cuda stream are synchronized. 
Otherwise, the events + // are only guaranteed to be synchronized if the model provides the output + // on GPU. + cudaStreamSynchronize(GetCudaStreamByInstanceKind()); +#endif + + return nullptr; +} + +TRITONSERVER_Error* +ModelInstanceState::RecordBackendTimestamp( + uint64_t* timestamp, void* cuda_event) +{ + if ((Kind() == TRITONSERVER_INSTANCEGROUPKIND_GPU) || + ((Kind() == TRITONSERVER_INSTANCEGROUPKIND_MODEL) && (device_cnt_ > 0))) { +#ifdef TRITON_ENABLE_GPU + cudaEvent_t* lcuda_event = reinterpret_cast(cuda_event); + RETURN_IF_ERROR(ConvertCUDAStatusToTritonError( + cudaEventRecord(*lcuda_event, GetCudaStreamByInstanceKind()), + TRITONSERVER_ERROR_INTERNAL, "Failed to record the event.")); +#endif + } else { + SET_TIMESTAMP(*timestamp); + } + return nullptr; +} + +void +ModelInstanceState::SetCurrentCudaStream( + const cudaStream_t& stream, const int& device_id) +{ +#ifdef TRITON_ENABLE_GPU + at::cuda::CUDAStream torch_stream = + at::cuda::getStreamFromExternal(stream, device_id); + // This function replaces the default stream with the stream we created. It + // is not necessary to change the current device to the desired device when + // replacing the default stream for that device. See the documentation here: + // https://pytorch.org/cppdocs/api/function_namespacec10_1_1cuda_1a6ed50cc0fc16cc7014d9c2f4c3bd098d.html + at::cuda::setCurrentCUDAStream(torch_stream); +#endif +} + +TRITONSERVER_Error* +ModelInstanceState::SetInputTensors( + size_t total_batch_size, TRITONBACKEND_Request** requests, + const uint32_t request_count, + std::vector* responses, + BackendInputCollector* collector, std::vector* input_names, + std::vector* input_tensors, bool* cuda_copy) +{ + // InferenceMode should be used to guard all tensors operations + torch::InferenceMode infer_guard(model_state_->EnabledInferenceMode()); + + // All requests must have equally-sized input tensors so use any + // request as the representative for the input tensors. + uint32_t input_count; + RETURN_IF_ERROR(TRITONBACKEND_RequestInputCount(requests[0], &input_count)); + + input_tensors->resize(input_count + batch_input_count_); + + // The inputs must be in contiguous CPU/GPU memory. + std::vector> alloc_perference; + if (device_.is_cpu()) { + alloc_perference = { + {TRITONSERVER_MEMORY_CPU_PINNED, 0}, {TRITONSERVER_MEMORY_CPU, 0}}; + } else { + alloc_perference = {{TRITONSERVER_MEMORY_GPU, device_.index()}}; + } + + for (uint32_t input_idx = 0; input_idx < input_count; input_idx++) { + TRITONBACKEND_Input* input; + RETURN_IF_ERROR( + TRITONBACKEND_RequestInputByIndex(requests[0], input_idx, &input)); + + const char* input_name; + TRITONSERVER_DataType input_datatype; + const int64_t* input_shape; + uint32_t input_dims_count; + RETURN_IF_ERROR(TRITONBACKEND_InputProperties( + input, &input_name, &input_datatype, &input_shape, &input_dims_count, + nullptr, nullptr)); + + input_names->emplace_back(input_name); + + // The shape for the entire input patch, + // [total_batch_size, ...] 
for non-ragged input and + // [total_element_count] for ragged input (non-nested tensor) + std::vector batchn_shape; + if (StateForModel()->IsInputRagged(input_name)) { + batchn_shape = std::vector{0}; + for (size_t idx = 0; idx < request_count; idx++) { + TRITONBACKEND_Input* input; + RESPOND_AND_SET_NULL_IF_ERROR( + &((*responses)[idx]), + TRITONBACKEND_RequestInput(requests[idx], input_name, &input)); + const int64_t* input_shape; + uint32_t input_dims_count; + RESPOND_AND_SET_NULL_IF_ERROR( + &((*responses)[idx]), TRITONBACKEND_InputProperties( + input, nullptr, nullptr, &input_shape, + &input_dims_count, nullptr, nullptr)); + + int64_t element_cnt = 0; + RESPOND_AND_SET_NULL_IF_ERROR( + &((*responses)[idx]), + GetElementCount(input_shape, input_dims_count, &element_cnt)); + batchn_shape[0] += element_cnt; + } + } else { + batchn_shape = + std::vector(input_shape, input_shape + input_dims_count); + if (supports_batching_) { + batchn_shape[0] = total_batch_size; + } + } + + // The input must be in contiguous CPU/GPU memory. + std::vector> alloc_perference; + // For 'KIND_MODEL', input will always be in CPU as we don't have a way to + // query the input types. + if (device_.is_cpu() || (Kind() == TRITONSERVER_INSTANCEGROUPKIND_MODEL)) { + alloc_perference = { + {TRITONSERVER_MEMORY_CPU_PINNED, 0}, {TRITONSERVER_MEMORY_CPU, 0}}; + } else { + alloc_perference = {{TRITONSERVER_MEMORY_GPU, device_.index()}}; + } + + const char* input_buffer; + size_t batchn_byte_size; + TRITONSERVER_MemoryType memory_type; + int64_t memory_type_id; + RETURN_IF_ERROR(collector->ProcessTensor( + input_name, nullptr, 0, alloc_perference, &input_buffer, + &batchn_byte_size, &memory_type, &memory_type_id)); + + // Create Torch tensor + const auto torch_dtype = ConvertDataTypeToTorchType(input_datatype); + torch::TensorOptions options{torch_dtype.second}; + auto updated_options = (memory_type == TRITONSERVER_MEMORY_GPU) + ? options.device(torch::kCUDA, device_.index()) + : options.device(torch::kCPU); + + if (input_datatype == TRITONSERVER_TYPE_BYTES) { + // Create the PyTorch list to hold the strings. 
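(Aside before the string path continues below, not part of the patch: a quick worked example for the shape handling above, with hypothetical request shapes. In the ragged branch, two requests with input shapes [3, 4] and [5, 4] contribute element counts of 12 and 20, so batchn_shape collapses to the single dimension [32]. In the non-ragged branch with max_batch_size > 0 and a per-request shape of [1, 4], the first dimension is overwritten with the accumulated batch, giving [total_batch_size, 4].)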
+ torch::List input_list; + input_list.reserve(batchn_shape[0]); + + for (size_t idx = 0; idx < request_count; idx++) { + TRITONBACKEND_Input* input; + RESPOND_AND_SET_NULL_IF_ERROR( + &((*responses)[idx]), + TRITONBACKEND_RequestInput(requests[idx], input_name, &input)); + const int64_t* shape; + uint32_t dims_count; + uint32_t buffer_count; + RESPOND_AND_SET_NULL_IF_ERROR( + &((*responses)[idx]), + TRITONBACKEND_InputPropertiesForHostPolicy( + input, HostPolicyName().c_str(), nullptr, nullptr, &shape, + &dims_count, nullptr, &buffer_count)); + + int64_t batch_element_cnt = 0; + RESPOND_AND_SET_NULL_IF_ERROR( + &((*responses)[idx]), + GetElementCount(shape, dims_count, &batch_element_cnt)); + + *cuda_copy |= SetStringInputTensor( + &input_list, input, input_name, buffer_count, batch_element_cnt, + &((*responses)[idx]), GetCudaStreamByInstanceKind(), + HostPolicyName().c_str()); + } + + (*input_tensors)[input_index_map_[input_name]] = input_list; + } else { + if (batchn_byte_size) { + // Remove constness to align with the signature of torch::from_blob() + torch::Tensor input_tensor = torch::from_blob( + const_cast(input_buffer), batchn_shape, updated_options); + (*input_tensors)[input_index_map_[input_name]] = input_tensor; + } else { + // torch:from_blob seems not working when the input size is 0 + // create zero-length inputs directly + torch::Tensor input_tensor = + torch::zeros(batchn_shape, updated_options); + (*input_tensors)[input_index_map_[input_name]] = input_tensor; + } + } + } + + for (const auto& batch_input : StateForModel()->BatchInputs()) { + std::vector shape; + collector->BatchInputShape(batch_input, &shape); + + for (const auto& input_name : batch_input.TargetNames()) { + input_names->emplace_back(input_name.c_str()); + + const char* dst_buffer; + size_t dst_buffer_byte_size; + TRITONSERVER_MemoryType dst_memory_type; + int64_t dst_memory_type_id; + + RESPOND_ALL_AND_SET_NULL_IF_ERROR( + (*responses), responses->size(), + collector->ProcessBatchInput( + batch_input, nullptr, 0, alloc_perference, &dst_buffer, + &dst_buffer_byte_size, &dst_memory_type, &dst_memory_type_id)); + + const auto torch_dtype = + ConvertDataTypeToTorchType(batch_input.DataType()); + torch::TensorOptions options{torch_dtype.second}; + auto updated_options = (dst_memory_type == TRITONSERVER_MEMORY_GPU) + ? options.device(torch::kCUDA, device_.index()) + : options.device(torch::kCPU); + + if (dst_buffer_byte_size) { + torch::Tensor input_tensor = torch::from_blob( + const_cast(dst_buffer), shape, updated_options); + (*input_tensors)[input_index_map_[input_name]] = input_tensor; + } else { + // special handle when input has zero size + torch::Tensor input_tensor = torch::zeros(shape, updated_options); + (*input_tensors)[input_index_map_[input_name]] = input_tensor; + } + } + } + + // Finalize... 
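(Aside, not part of the patch: the non-string path above wraps Triton's contiguous input buffers with torch::from_blob, so no copy is made and the underlying buffer must stay alive for as long as the tensor is used. A minimal standalone sketch of that idea, with hypothetical buffer contents and shape:)

// Illustrative sketch: zero-copy view over an existing buffer via torch::from_blob.
#include <torch/torch.h>
#include <vector>

int main()
{
  std::vector<float> buffer = {1.f, 2.f, 3.f, 4.f, 5.f, 6.f};  // owned elsewhere
  // Wrap the existing memory; no allocation or copy happens here.
  torch::Tensor view = torch::from_blob(
      buffer.data(), {2, 3}, torch::TensorOptions().dtype(torch::kFloat32));
  // 'view' aliases 'buffer', so it must not outlive the vector.
  torch::Tensor doubled = view * 2;  // arithmetic produces a new, owning tensor
  return 0;
}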
+ *cuda_copy |= collector->Finalize(); + + return nullptr; +} + +TRITONSERVER_Error* +ModelInstanceState::ValidateBooleanSequenceControl( + triton::common::TritonJson::Value& sequence_batching, + const std::string& control_kind, bool required, bool* have_control) +{ + std::string tensor_name; + std::string tensor_datatype; + RETURN_IF_ERROR(GetBooleanSequenceControlProperties( + sequence_batching, model_state_->Name(), control_kind, required, + &tensor_name, &tensor_datatype, nullptr, nullptr, nullptr, nullptr, + nullptr, nullptr)); + *have_control = !tensor_name.empty(); + if (*have_control) { + std::string deliminator = "__"; + int ip_index = 0; + int start_pos = tensor_name.find(deliminator); + if (start_pos == -1) { + return TRITONSERVER_ErrorNew( + TRITONSERVER_ERROR_INTERNAL, + ("input '" + tensor_name + + "' does not follow __ naming convention.") + .c_str()); + } + + // check if the index part of the name is not an integer + std::string index_str = tensor_name.substr(start_pos + 2); + for (auto itr = index_str.begin(); itr != index_str.end(); itr++) { + if (std::isdigit(*itr) == 0) { + return TRITONSERVER_ErrorNew( + TRITONSERVER_ERROR_INTERNAL, + ("input '" + tensor_name + + "' does not follow __ naming convention.") + .c_str()); + } + } + + ip_index = std::atoi(tensor_name.substr(start_pos + 2).c_str()); + input_index_map_[tensor_name] = ip_index; + } + + return nullptr; // success +} + +TRITONSERVER_Error* +ModelInstanceState::ValidateInputs(const size_t expected_input_cnt) +{ + // Collect all the expected input tensor names and validate that the model + // configuration specifies only those. + std::vector allowed_inputs; + + const torch::jit::Method& method = torch_model_->get_method("forward"); + const auto& schema = method.function().getSchema(); + const std::vector& arguments = schema.arguments(); + + // Currently, only models with a single input of type Dict(str, Tensor) are + // supported. If the model expects more than one input then they must be all + // be of type Tensor. + // + // Ignore the argument at idx 0 if it is of Class type (self param in forward + // function) + size_t start_idx = 0; + if ((arguments.size() > 0) && + (arguments.at(0).type()->kind() == c10::TypeKind::ClassType)) { + start_idx = 1; + } + if ((arguments.size() == (1 + start_idx)) && + (arguments.at(start_idx).type()->kind() == c10::TypeKind::DictType)) { + is_dict_input_ = true; + } else if (arguments.size() > start_idx) { + // Return error if multiple inputs are of kind DictType + for (size_t i = start_idx + 1; i < arguments.size(); i++) { + if (arguments.at(i).type()->kind() == c10::TypeKind::DictType) { + return TRITONSERVER_ErrorNew( + TRITONSERVER_ERROR_INTERNAL, + "Multiple inputs of kind DictType were detected. Only a single " + "input of type Dict(str, Tensor) is supported."); + } + } + + // Return error if all inputs are not of type Tensor + for (size_t i = start_idx; i < arguments.size(); i++) { + if ((arguments.at(i).type()->kind() != c10::TypeKind::TensorType) && + (arguments.at(i).type()->kind() != c10::TypeKind::ListType)) { + return TRITONSERVER_ErrorNew( + TRITONSERVER_ERROR_INTERNAL, + (std::string("An input of type '") + arguments.at(i).type()->str() + + "' was detected in the model. 
Only a single input of type " + "Dict(str, Tensor) or input(s) of type Tensor are supported.") + .c_str()); + } + allowed_inputs.emplace_back(arguments.at(i).name()); + } + + // If all inputs are tensors, match number of expected inputs between model + // and configuration + if ((arguments.size() - start_idx) != expected_input_cnt) { + return TRITONSERVER_ErrorNew( + TRITONSERVER_ERROR_INVALID_ARG, + (std::string("unable to load model '") + model_state_->Name() + + "', configuration expects " + std::to_string(expected_input_cnt) + + " inputs, model provides " + + std::to_string(arguments.size() - start_idx)) + .c_str()); + } + } + + triton::common::TritonJson::Value ios; + RETURN_IF_ERROR(model_state_->ModelConfig().MemberAsArray("input", &ios)); + + if (ios.ArraySize() == 0) { + return TRITONSERVER_ErrorNew( + TRITONSERVER_ERROR_INTERNAL, + "model configuration must contain at least one input, none were " + "specified."); + } + + NamingConvention naming_convention; + RETURN_IF_ERROR(GetNamingConvention(&naming_convention, allowed_inputs)); + + for (size_t i = 0; i < ios.ArraySize(); i++) { + triton::common::TritonJson::Value io; + RETURN_IF_ERROR(ios.IndexAsObject(i, &io)); + + // Validate name + std::string io_name; + RETURN_IF_ERROR(io.MemberAsString("name", &io_name)); + AddInputToMap(naming_convention, allowed_inputs, io_name, i); + // Validate data type + std::string io_dtype; + RETURN_IF_ERROR(io.MemberAsString("data_type", &io_dtype)); + const auto pr = ModelConfigDataTypeToTorchType(io_dtype); + if (!pr.first && (io_dtype != "TYPE_STRING")) { + return TRITONSERVER_ErrorNew( + TRITONSERVER_ERROR_INTERNAL, + ("unsupported datatype " + io_dtype + " for input '" + io_name + + "' for model '" + model_state_->Name() + "'") + .c_str()); + } + + // Validate shape for String inputs. Only allow 1 dimension. + if (io_dtype == "TYPE_STRING") { + // If a reshape is provided for the input then use that when + // validating the model shapes. + std::vector dims; + triton::common::TritonJson::Value reshape; + if (io.Find("reshape", &reshape)) { + RETURN_IF_ERROR(ParseShape(reshape, "shape", &dims)); + } else { + RETURN_IF_ERROR(ParseShape(io, "dims", &dims)); + } + + if ((dims.size() + (supports_batching_ ? 1 : 0)) > 1) { + return TRITONSERVER_ErrorNew( + TRITONSERVER_ERROR_INTERNAL, + ("Triton only supports 1 dimensional List of String as input for " + "'" + + std::string(io_name) + "' for model '" + model_state_->Name() + + "'") + .c_str()); + } + } + } + triton::common::TritonJson::Value sequence_batching; + if (model_state_->ModelConfig().Find( + "sequence_batching", &sequence_batching)) { + triton::common::TritonJson::Value states; + if (sequence_batching.Find("state", &states)) { + for (size_t i = 0; i < states.ArraySize(); i++) { + triton::common::TritonJson::Value state; + RETURN_IF_ERROR(states.IndexAsObject(i, &state)); + std::string state_name; + RETURN_IF_ERROR(state.MemberAsString("input_name", &state_name)); + AddInputToMap(naming_convention, allowed_inputs, state_name, i); + + // Validate data type + std::string state_dtype; + RETURN_IF_ERROR(state.MemberAsString("data_type", &state_dtype)); + const auto pr = ModelConfigDataTypeToTorchType(state_dtype); + if (!pr.first && (state_dtype != "TYPE_STRING")) { + return TRITONSERVER_ErrorNew( + TRITONSERVER_ERROR_INTERNAL, + ("unsupported datatype " + state_dtype + " for input state '" + + state_name + "' for model '" + model_state_->Name() + "'") + .c_str()); + } + + // Validate shape for String inputs. Only allow 1 dimension. 
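(Aside, not part of the patch: to make the dimension check above concrete, with hypothetical configurations. For a non-batching model, max_batch_size = 0 makes supports_batching_ false, so a TYPE_STRING input with dims: [-1] yields dims.size() + 0 = 1 and passes, while dims: [2, -1] yields 2 and is rejected. Once batching is enabled the implicit batch dimension adds 1 to the count, so as written the check only passes when the effective dims — after applying any reshape — are empty, which is why the reshape is consulted before validating.)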
+ if (state_dtype == "TYPE_STRING") { + std::vector dims; + if ((dims.size() + (supports_batching_ ? 1 : 0)) > 1) { + return TRITONSERVER_ErrorNew( + TRITONSERVER_ERROR_INTERNAL, + ("Triton only supports 1 dimensional List of String as input " + "for " + "'" + + std::string(state_name) + "' for model '" + + model_state_->Name() + "'") + .c_str()); + } + } + } + } + } + + triton::common::TritonJson::Value batch_inputs; + RETURN_IF_ERROR( + model_state_->ModelConfig().MemberAsArray("batch_input", &batch_inputs)); + size_t i = 0; + for (const auto& batch_input : StateForModel()->BatchInputs()) { + for (const auto& input_name : batch_input.TargetNames()) { + AddInputToMap( + naming_convention, allowed_inputs, input_name, i + ios.ArraySize()); + i++; + } + } + + return nullptr; // success +} + +TRITONSERVER_Error* +ModelInstanceState::ValidateOutputs() +{ + triton::common::TritonJson::Value ios; + RETURN_IF_ERROR(model_state_->ModelConfig().MemberAsArray("output", &ios)); + std::string deliminator = "__"; + int op_index = 0; + + if (ios.ArraySize() == 0) { + return TRITONSERVER_ErrorNew( + TRITONSERVER_ERROR_INTERNAL, + "model configuration must contain at least one output, none were " + "specified."); + } + + NamingConvention naming_convention; + RETURN_IF_ERROR(GetNamingConvention(&naming_convention, {})); + + for (size_t i = 0; i < ios.ArraySize(); i++) { + triton::common::TritonJson::Value io; + RETURN_IF_ERROR(ios.IndexAsObject(i, &io)); + + // Validate name + std::string io_name; + RETURN_IF_ERROR(io.MemberAsString("name", &io_name)); + switch (naming_convention) { + case NamingConvention::NAMED_INDEX: { + int start_pos = io_name.find(deliminator); + op_index = std::atoi(io_name.substr(start_pos + 2).c_str()); + break; + } + case NamingConvention::STRICT_CONFIG_ORDERING: { + op_index = i; + break; + } + default: + break; + } + + // Validate data type + std::string io_dtype; + RETURN_IF_ERROR(io.MemberAsString("data_type", &io_dtype)); + const auto pr = ModelConfigDataTypeToTorchType(io_dtype); + if (!pr.first && (io_dtype != "TYPE_STRING")) { + return TRITONSERVER_ErrorNew( + TRITONSERVER_ERROR_INTERNAL, + ("unsupported datatype " + io_dtype + " for output '" + io_name + + "' for model '" + model_state_->Name() + "'") + .c_str()); + } + + // Validate shape for String outputs. Only allow 1 dimension. + if (io_dtype == "TYPE_STRING") { + // If a reshape is provided for the output then use that when + // validating the model shapes. + std::vector dims; + triton::common::TritonJson::Value reshape; + if (io.Find("reshape", &reshape)) { + RETURN_IF_ERROR(ParseShape(reshape, "shape", &dims)); + } else { + RETURN_IF_ERROR(ParseShape(io, "dims", &dims)); + } + + if ((dims.size() + (supports_batching_ ? 
1 : 0)) > 1) { + return TRITONSERVER_ErrorNew( + TRITONSERVER_ERROR_INTERNAL, + ("Triton only supports 1 dimensional List of String as output for " + "'" + + std::string(io_name) + "' for model '" + model_state_->Name() + + "'") + .c_str()); + } + } + + output_index_map_[io_name] = op_index; + output_dtype_map_[io_name] = ConvertTorchTypeToDataType(pr.second); + } + + triton::common::TritonJson::Value sequence_batching; + if (model_state_->ModelConfig().Find( + "sequence_batching", &sequence_batching)) { + triton::common::TritonJson::Value states; + if (sequence_batching.Find("state", &states)) { + for (size_t i = 0; i < states.ArraySize(); i++) { + triton::common::TritonJson::Value state; + RETURN_IF_ERROR(states.IndexAsObject(i, &state)); + std::string state_name; + RETURN_IF_ERROR(state.MemberAsString("output_name", &state_name)); + std::string state_dtype; + RETURN_IF_ERROR(state.MemberAsString("data_type", &state_dtype)); + std::vector dims; + RETURN_IF_ERROR(ParseShape(state, "dims", &dims)); + + // For state, naming convention is enforced to be NAMED_INDEX + int start_pos = state_name.find(deliminator); + op_index = std::atoi(state_name.substr(start_pos + 2).c_str()); + + const auto pr = ModelConfigDataTypeToTorchType(state_dtype); + if (!pr.first && (state_dtype != "TYPE_STRING")) { + return TRITONSERVER_ErrorNew( + TRITONSERVER_ERROR_INTERNAL, + ("unsupported datatype " + state_dtype + " for state '" + + state_name + "' for model '" + model_state_->Name() + "'") + .c_str()); + } + + // Validate shape for String outputs. Only allow 1 dimension. + if (state_dtype == "TYPE_STRING") { + if ((dims.size() + (supports_batching_ ? 1 : 0)) > 1) { + return TRITONSERVER_ErrorNew( + TRITONSERVER_ERROR_INTERNAL, + ("Triton only supports 1 dimensional List of String as output " + "for " + "'" + + std::string(state_name) + "' for model '" + + model_state_->Name() + "'") + .c_str()); + } + } + + output_index_map_[state_name] = op_index; + output_dtype_map_[state_name] = ConvertTorchTypeToDataType(pr.second); + } + } + } + + return nullptr; // success +} + +TRITONSERVER_Error* +ModelInstanceState::ValidateTypedSequenceControl( + triton::common::TritonJson::Value& sequence_batching, + const std::string& control_kind, bool required, bool* have_control) +{ + std::string tensor_name; + std::string tensor_datatype; + RETURN_IF_ERROR(GetTypedSequenceControlProperties( + sequence_batching, model_state_->Name(), control_kind, required, + &tensor_name, &tensor_datatype)); + *have_control = !tensor_name.empty(); + if (*have_control) { + std::string deliminator = "__"; + int ip_index = 0; + int start_pos = tensor_name.find(deliminator); + if (start_pos == -1) { + return TRITONSERVER_ErrorNew( + TRITONSERVER_ERROR_INTERNAL, + ("input '" + tensor_name + + "' does not follow __ naming convention.") + .c_str()); + } + + // check if the index part of the name is not an integer + std::string index_str = tensor_name.substr(start_pos + 2); + for (auto itr = index_str.begin(); itr != index_str.end(); itr++) { + if (std::isdigit(*itr) == 0) { + return TRITONSERVER_ErrorNew( + TRITONSERVER_ERROR_INTERNAL, + ("input '" + tensor_name + + "' does not follow __ naming convention.") + .c_str()); + } + } + + // check if the data type is supported by PyTorch + if (!ModelConfigDataTypeToTorchType(tensor_datatype).first) { + return TRITONSERVER_ErrorNew( + TRITONSERVER_ERROR_INTERNAL, + ("input '" + tensor_name + "' type '" + tensor_datatype + + "' is not supported by PyTorch.") + .c_str()); + } + + ip_index = 
std::atoi(tensor_name.substr(start_pos + 2).c_str()); + input_index_map_[tensor_name] = ip_index; + } + + return nullptr; // success +} + + +} // namespace triton::backend::pytorch diff --git a/src/model_instance_state.hh b/src/model_instance_state.hh new file mode 100644 index 0000000..b495510 --- /dev/null +++ b/src/model_instance_state.hh @@ -0,0 +1,178 @@ +// Copyright 2019-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions +// are met: +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// * Neither the name of NVIDIA CORPORATION nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +#pragma once + +#include + +#include +#include +#include +#include +#include + +#include "libtorch_utils.h" +#include "model_state.hh" +#include "naming_convention.hh" +#include "triton/backend/backend_common.h" +#include "triton/backend/backend_input_collector.h" +#include "triton/backend/backend_memory.h" +#include "triton/backend/backend_model.h" +#include "triton/backend/backend_model_instance.h" +#include "triton/backend/backend_output_responder.h" +#include "triton/common/nvtx.h" +#include "triton/core/tritonbackend.h" + + +namespace triton::backend::pytorch { + +// +// ModelInstanceState +// +// State associated with a model instance. An object of this class is +// created and associated with each TRITONBACKEND_ModelInstance. +// +class ModelInstanceState : public BackendModelInstance { + private: + ModelState* model_state_; + + // The full path to the TorchScript model file. + std::string model_path_; + + std::shared_ptr torch_model_; + torch::Device device_; + + // Map from configuration name for an input to the index of + // that input in the model. + std::unordered_map input_index_map_; + uint32_t batch_input_count_ = 0; + + // Map from configuration name for an output to the index of + // that output in the model. + std::unordered_map output_index_map_; + std::unordered_map output_dtype_map_; + + // If the input to the tensor is a dictionary of tensors. + bool is_dict_input_; + + // If the model supports batching. 
+ bool supports_batching_; + + cudaEvent_t compute_input_start_event_; + cudaEvent_t compute_infer_start_event_; + cudaEvent_t compute_output_start_event_; + + // Store the cuda streams created for the 'KIND_MODEL' instance group. + std::vector stream_vec_; + + // The number of available devices. + int device_cnt_; + + public: + virtual ~ModelInstanceState(); + + // Clear CUDA cache + void ClearCache(); + + static TRITONSERVER_Error* Create( + ModelState* model_state, + TRITONBACKEND_ModelInstance* triton_model_instance, + ModelInstanceState** state); + + // Execute... + void ProcessRequests( + TRITONBACKEND_Request** requests, const uint32_t request_count); + + // Get the state of the model that corresponds to this instance. + ModelState* StateForModel() const; + + private: + ModelInstanceState( + ModelState* model_state, + TRITONBACKEND_ModelInstance* triton_model_instance); + + void AddInputToMap( + NamingConvention naming_convention, + const std::vector allowed_inputs, const std::string& io_name, + const uint32_t index); + + // Create CUDA events for statistics collection. + void CreateCudaEvents(const int32_t& device_id); + + void Execute( + std::vector* responses, + const uint32_t response_count, + std::vector* input_tensors, + std::vector* output_tensors); + + // Get the elapsed time between two CUDA events. + float GetCudaEventElapsedTime( + const cudaEvent_t& start_event, const cudaEvent_t& end_event); + + // Get the appropriate CUDA stream for input and output handling based on + // the instance group type. + cudaStream_t GetCudaStreamByInstanceKind(); + + // Get the naming convention for inputs/outputs from the model configuration + TRITONSERVER_Error* GetNamingConvention( + NamingConvention* naming_convention, + const std::vector& allowed_io); + + TRITONSERVER_Error* ReadOutputTensors( + size_t total_batch_size, + const std::vector& output_tensors, + TRITONBACKEND_Request** requests, const uint32_t request_count, + std::vector* responses); + + TRITONSERVER_Error* RecordBackendTimestamp( + uint64_t* timestamp, void* cuda_event); + + // Replace the default CUDA stream with the stream we created to ensure + // proper cuda stream synchronization. + void SetCurrentCudaStream( + const cudaStream_t& stream, const int32_t& device_id); + + TRITONSERVER_Error* SetInputTensors( + size_t total_batch_size, TRITONBACKEND_Request** requests, + const uint32_t request_count, + std::vector* responses, + BackendInputCollector* collector, std::vector* input_names, + std::vector* input_tensors, bool* cuda_copy); + + TRITONSERVER_Error* ValidateBooleanSequenceControl( + triton::common::TritonJson::Value& sequence_batching, + const std::string& control_kind, bool required, bool* have_control); + + TRITONSERVER_Error* ValidateInputs(const size_t expected_input_cnt); + + TRITONSERVER_Error* ValidateOutputs(); + + TRITONSERVER_Error* ValidateTypedSequenceControl( + triton::common::TritonJson::Value& sequence_batching, + const std::string& control_kind, bool required, bool* have_control); +}; + +} // namespace triton::backend::pytorch diff --git a/src/model_state.cc b/src/model_state.cc new file mode 100644 index 0000000..b007438 --- /dev/null +++ b/src/model_state.cc @@ -0,0 +1,495 @@ +// Copyright 2019-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions +// are met: +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// * Neither the name of NVIDIA CORPORATION nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +#include "model_state.hh" + +#include + + +namespace { +std::once_flag pytorch_interop_threads_flag; +std::once_flag pytorch_intraop_threads_flag; +} // namespace + +namespace triton::backend::pytorch { + +ModelState::ModelState(TRITONBACKEND_Model* triton_model) + : BackendModel(triton_model), enable_optimized_execution_(true), + enable_inference_mode_(true), enable_cudnn_(true), + enable_cache_cleaning_(false), enable_weight_sharing_(false), + enable_tensor_fuser_pair_({false, true}), + enable_jit_profiling_pair_({false, true}), + enable_jit_executor_pair_({false, true}) +{ +} + +TRITONSERVER_Error* +ModelState::AutoCompleteConfig() +{ + // Auto-complete configuration is not supported since PyTorch does not + // store/capture sufficient model metadata so just log error instead. + LOG_MESSAGE( + TRITONSERVER_LOG_WARN, + (std::string("skipping model configuration auto-complete for '") + + Name() + "': not supported for pytorch backend") + .c_str()); + + return nullptr; // success +} + +TRITONSERVER_Error* +ModelState::Create(TRITONBACKEND_Model* triton_model, ModelState** state) +{ + try { + *state = new ModelState(triton_model); + } + catch (const BackendModelException& ex) { + RETURN_ERROR_IF_TRUE( + ex.err_ == nullptr, TRITONSERVER_ERROR_INTERNAL, + std::string("unexpected nullptr in BackendModelException")); + RETURN_IF_ERROR(ex.err_); + } + + // Auto-complete the configuration if requested... 
+ bool auto_complete_config = false; + RETURN_IF_ERROR(TRITONBACKEND_ModelAutoCompleteConfig( + triton_model, &auto_complete_config)); + if (auto_complete_config) { + RETURN_IF_ERROR((*state)->AutoCompleteConfig()); + RETURN_IF_ERROR((*state)->SetModelConfig()); + } + + auto& model_outputs = (*state)->model_outputs_; + // Parse the output states in the model configuration + triton::common::TritonJson::Value sequence_batching; + if ((*state)->ModelConfig().Find("sequence_batching", &sequence_batching)) { + triton::common::TritonJson::Value states; + if (sequence_batching.Find("state", &states)) { + for (size_t i = 0; i < states.ArraySize(); i++) { + triton::common::TritonJson::Value state; + RETURN_IF_ERROR(states.IndexAsObject(i, &state)); + std::string output_state_name; + RETURN_IF_ERROR( + state.MemberAsString("output_name", &output_state_name)); + auto it = model_outputs.find(output_state_name); + if (it == model_outputs.end()) { + model_outputs.insert({output_state_name, std::make_pair(-1, i)}); + } else { + it->second.second = i; + } + } + } + } + + // Parse the output names in the model configuration + triton::common::TritonJson::Value outputs; + RETURN_IF_ERROR((*state)->ModelConfig().MemberAsArray("output", &outputs)); + for (size_t i = 0; i < outputs.ArraySize(); i++) { + triton::common::TritonJson::Value output; + THROW_IF_BACKEND_INSTANCE_ERROR(outputs.IndexAsObject(i, &output)); + + // Use names from ModelConfig by reference since the model + // config will persist longer than this inference execution. + std::string output_name; + THROW_IF_BACKEND_INSTANCE_ERROR( + output.MemberAsString("name", &output_name)); + + auto it = model_outputs.find(output_name); + if (it == model_outputs.end()) { + model_outputs.insert({output_name, std::make_pair(i, -1)}); + } else { + it->second.first = i; + } + } + + RETURN_IF_ERROR((*state)->ParseParameters()); + + return nullptr; // success +} + +bool +ModelState::EnabledCacheCleaning() +{ + return enable_cache_cleaning_; +} + +bool +ModelState::EnabledCudnn() +{ + return enable_cudnn_; +} + +bool +ModelState::EnabledInferenceMode() +{ + return enable_inference_mode_; +} + +const std::pair& +ModelState::EnabledJitExecutor() const +{ + return enable_jit_executor_pair_; +} + +const std::pair& +ModelState::EnabledJitProfiling() const +{ + return enable_jit_profiling_pair_; +} + +bool +ModelState::EnabledOptimizedExecution() +{ + return enable_optimized_execution_; +} + +const std::pair& +ModelState::EnabledTensorExprFuser() const +{ + return enable_tensor_fuser_pair_; +} + +bool +ModelState::EnabledWeightSharing() +{ + return enable_weight_sharing_; +} + +TRITONSERVER_Error* +ModelState::LoadModel( + const std::string& artifact_name, const torch::Device device, + std::string* model_path, const TRITONSERVER_InstanceGroupKind& kind, + std::shared_ptr* torch_model) +{ + // Find the TorchScript file that describes the model. If the model + // configuration doesn't have an explicit model file specified then + // use the default name ("model.pt"). 
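(Aside on the model_outputs_ bookkeeping assembled in Create() above, not part of the patch: each entry maps an output name to a pair of indices, the first into the configuration's output array and the second into the sequence-batching state array, with -1 meaning "not present". With a hypothetical configuration, an output that only appears under output gets (i, -1), one that only appears as a state's output_name gets (-1, j), and a tensor serving both roles ends up with both indices set; ReadOutputTensors later checks each half of the pair against -1 to decide whether to produce a response output, a state update, or both.)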
+ std::string cc_model_filename = artifact_name; + if (cc_model_filename.empty()) { + cc_model_filename = "model.pt"; + } + + *model_path = JoinPath( + {RepositoryPath(), std::to_string(Version()), cc_model_filename}); + + { + bool exists; + RETURN_IF_ERROR(FileExists(*model_path, &exists)); + RETURN_ERROR_IF_FALSE( + exists, TRITONSERVER_ERROR_UNAVAILABLE, + std::string("unable to find '") + *model_path + + "' for model instance '" + Name() + "'"); + } + + // If weight sharing is enabled, skip loading model if + // it is already available on the target device + std::pair device_pair; + if (enable_weight_sharing_) { + device_pair = std::make_pair(!device.is_cpu(), device.index()); + auto mit = torch_models_.find(device_pair); + if (mit != torch_models_.end()) { + *torch_model = mit->second; + LOG_MESSAGE( + TRITONSERVER_LOG_INFO, + (std::string("Reusing TorchScript model for instance '") + Name() + + "'") + .c_str()); + return nullptr; // success + } + } + + // Serialize the torch model to string + std::string model_data_str; + RETURN_IF_ERROR(ReadTextFile(*model_path, &model_data_str)); + + // InferenceMode should be used to guard all tensors operations including + // model loading: https://pytorch.org/cppdocs/notes/inference_mode.html + torch::InferenceMode infer_guard(EnabledInferenceMode()); + + try { + std::istringstream model_stream(model_data_str); + if (kind == TRITONSERVER_INSTANCEGROUPKIND_MODEL) { + // Load the model without selecting a device. + torch_model->reset( + new torch::jit::Module(torch::jit::load(model_stream))); + } else { + torch_model->reset( + new torch::jit::Module(torch::jit::load(model_stream, device))); + } + } + catch (const std::exception& ex) { + return TRITONSERVER_ErrorNew( + TRITONSERVER_ERROR_INTERNAL, + ("failed to load model '" + Name() + "': " + ex.what()).c_str()); + } + + if (enable_weight_sharing_) { + if (!((torch_models_.emplace(device_pair, *torch_model)).second)) { + std::string type = device.is_cpu() ? "CPU" : "GPU"; + LOG_MESSAGE( + TRITONSERVER_LOG_WARN, + (std::string("Model already found on target ") + type + " device " + + "(id " + std::to_string(device.index()) + ") for '" + Name() + "'") + .c_str()); + } + } + + return nullptr; // success +} + +const std::map>& +ModelState::ModelOutputs() +{ + return model_outputs_; +} + +TRITONSERVER_Error* +ModelState::ParseParameters() +{ + triton::common::TritonJson::Value params; + bool status = model_config_.Find("parameters", ¶ms); + if (status) { + // If 'DISABLE_OPTIMIZED_EXECUTION' is not present in 'parameters' then no + // update is made to 'enable_optimized_execution_'. + bool disable_optimized_execution = false; + TRITONSERVER_Error* err = ParseParameter( + params, "DISABLE_OPTIMIZED_EXECUTION", &disable_optimized_execution); + if (err != nullptr) { + if (TRITONSERVER_ErrorCode(err) != TRITONSERVER_ERROR_NOT_FOUND) { + return err; + } else { + TRITONSERVER_ErrorDelete(err); + } + } + enable_optimized_execution_ = !disable_optimized_execution; + + LOG_MESSAGE( + TRITONSERVER_LOG_INFO, + (std::string("Optimized execution is ") + + (enable_optimized_execution_ ? "enabled" : "disabled") + + " for model instance '" + Name() + "'") + .c_str()); + + // If 'ENABLE_CACHE_CLEANING' is not present in 'parameters' then + // no update is made to 'enable_cache_cleaning_'. 
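+    // ParseParameter reports a TRITONSERVER_ERROR_NOT_FOUND error when the
+    // key is absent; that error is deleted below so the default is kept.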
+ err = ParseParameter( + params, "ENABLE_CACHE_CLEANING", &enable_cache_cleaning_); + if (err != nullptr) { + if (TRITONSERVER_ErrorCode(err) != TRITONSERVER_ERROR_NOT_FOUND) { + return err; + } else { + TRITONSERVER_ErrorDelete(err); + } + } + + LOG_MESSAGE( + TRITONSERVER_LOG_INFO, + (std::string("Cache Cleaning is ") + + (enable_cache_cleaning_ ? "enabled" : "disabled") + + " for model instance '" + Name() + "'") + .c_str()); + + // If 'INFERENCE_MODE' is not present in 'parameters' then no update is made + // to 'enable_inference_mode_'. + err = ParseParameter(params, "INFERENCE_MODE", &enable_inference_mode_); + if (err != nullptr) { + if (TRITONSERVER_ErrorCode(err) != TRITONSERVER_ERROR_NOT_FOUND) { + return err; + } else { + TRITONSERVER_ErrorDelete(err); + } + } + LOG_MESSAGE( + TRITONSERVER_LOG_INFO, + (std::string("Inference Mode is ") + + (enable_inference_mode_ ? "enabled" : "disabled") + + " for model instance '" + Name() + "'") + .c_str()); + + // If 'DISABLE_CUDNN' is not present in 'parameters' then no update is made + // to 'enable_cudnn_'. + bool disable_cudnn = false; + err = ParseParameter(params, "DISABLE_CUDNN", &disable_cudnn); + if (err != nullptr) { + if (TRITONSERVER_ErrorCode(err) != TRITONSERVER_ERROR_NOT_FOUND) { + return err; + } else { + TRITONSERVER_ErrorDelete(err); + } + } + enable_cudnn_ = !disable_cudnn; + LOG_MESSAGE( + TRITONSERVER_LOG_INFO, + (std::string("cuDNN is ") + (enable_cudnn_ ? "enabled" : "disabled") + + " for model instance '" + Name() + "'") + .c_str()); + + // If 'ENABLE_TENSOR_FUSER' is not present in 'parameters' then no + // update is made to 'enable_tensor_fuser'. + bool enable_tensor_fuser = false; + err = ParseParameter(params, "ENABLE_TENSOR_FUSER", &enable_tensor_fuser); + if (err != nullptr) { + if (TRITONSERVER_ErrorCode(err) != TRITONSERVER_ERROR_NOT_FOUND) { + return err; + } else { + TRITONSERVER_ErrorDelete(err); + } + } else { + enable_tensor_fuser_pair_ = {true, enable_tensor_fuser}; + LOG_MESSAGE( + TRITONSERVER_LOG_INFO, + (std::string("Tensor fuser is ") + + (enable_tensor_fuser ? "enabled" : "disabled") + + " for model instance '" + Name() + "'") + .c_str()); + } + + // If 'ENABLE_WEIGHT_SHARING' is not present in 'parameters' then no + // update is made to 'enable_weight_sharing'. + err = ParseParameter( + params, "ENABLE_WEIGHT_SHARING", &enable_weight_sharing_); + if (err != nullptr) { + if (TRITONSERVER_ErrorCode(err) != TRITONSERVER_ERROR_NOT_FOUND) { + return err; + } else { + TRITONSERVER_ErrorDelete(err); + } + } else { + LOG_MESSAGE( + TRITONSERVER_LOG_INFO, + (std::string("Weight sharing is ") + + (enable_weight_sharing_ ? "enabled" : "disabled") + + " for model instance '" + Name() + "'") + .c_str()); + } + + // If 'ENABLE_JIT_PROFILING' is not present in 'parameters' then no update + // is made to 'enable_jit_profiling'. + bool enable_jit_profiling = false; + err = ParseParameter(params, "ENABLE_JIT_PROFILING", &enable_jit_profiling); + if (err != nullptr) { + if (TRITONSERVER_ErrorCode(err) != TRITONSERVER_ERROR_NOT_FOUND) { + return err; + } else { + TRITONSERVER_ErrorDelete(err); + } + } else { + enable_jit_profiling_pair_ = {true, enable_jit_profiling}; + LOG_MESSAGE( + TRITONSERVER_LOG_INFO, + (std::string("Jit profiling is ") + + (enable_jit_profiling ? "enabled" : "disabled") + + " for model instance '" + Name() + "'") + .c_str()); + } + + // If 'ENABLE_JIT_EXECUTOR' is not present in 'parameters' then no update is + // made to 'enable_jit_executor'. 
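+    // The (provided, value) pair convention lets the instance code leave the
+    // corresponding PyTorch global setting untouched when the parameter was
+    // never specified.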
+ bool enable_jit_executor = false; + err = ParseParameter(params, "ENABLE_JIT_EXECUTOR", &enable_jit_executor); + if (err != nullptr) { + if (TRITONSERVER_ErrorCode(err) != TRITONSERVER_ERROR_NOT_FOUND) { + return err; + } else { + TRITONSERVER_ErrorDelete(err); + } + } else { + enable_jit_executor_pair_ = {true, enable_jit_executor}; + LOG_MESSAGE( + TRITONSERVER_LOG_INFO, + (std::string("Jit executor is ") + + (enable_jit_executor ? "enabled" : "disabled") + + " for model instance '" + Name() + "'") + .c_str()); + } + + // If 'INTRA_OP_THREAD_COUNT' is not present in 'parameters' then no update + // is made to 'intra_op_thread_count', which by default will take all + // threads + int intra_op_thread_count = -1; + err = + ParseParameter(params, "INTRA_OP_THREAD_COUNT", &intra_op_thread_count); + if (err != nullptr) { + if (TRITONSERVER_ErrorCode(err) != TRITONSERVER_ERROR_NOT_FOUND) { + return err; + } else { + TRITONSERVER_ErrorDelete(err); + } + } else { + if (intra_op_thread_count > 0) { + // at::set_num_threads() does not throw if called more than once, but + // issues warnings. std::call_once() is useful to limit these. + std::call_once(pytorch_intraop_threads_flag, [intra_op_thread_count]() { + at::set_num_threads(intra_op_thread_count); + }); + LOG_MESSAGE( + TRITONSERVER_LOG_INFO, + (std::string("Intra op thread count is set to ") + + std::to_string(at::get_num_threads()) + " for model instance '" + + Name() + "'") + .c_str()); + } + } + + // If 'INTER_OP_THREAD_COUNT' is not present in 'parameters' then no update + // is made to 'inter_op_thread_count', which by default will take all + // threads + int inter_op_thread_count = -1; + err = + ParseParameter(params, "INTER_OP_THREAD_COUNT", &inter_op_thread_count); + if (err != nullptr) { + if (TRITONSERVER_ErrorCode(err) != TRITONSERVER_ERROR_NOT_FOUND) { + return err; + } else { + TRITONSERVER_ErrorDelete(err); + } + } else { + if (inter_op_thread_count > 0) { + // at::set_num_interop_threads() throws if called more than once. + // std::call_once() should prevent this, but try/catch is additionally + // used for safety. + std::call_once(pytorch_interop_threads_flag, [inter_op_thread_count]() { + try { + at::set_num_interop_threads(inter_op_thread_count); + } + catch (const c10::Error& e) { + // do nothing + } + }); + LOG_MESSAGE( + TRITONSERVER_LOG_INFO, + (std::string("Inter op thread count is set to ") + + std::to_string(at::get_num_interop_threads()) + + " for model instance '" + Name() + "'") + .c_str()); + } + } + } + + return nullptr; +} + +} // namespace triton::backend::pytorch diff --git a/src/model_state.hh b/src/model_state.hh new file mode 100644 index 0000000..1a404b8 --- /dev/null +++ b/src/model_state.hh @@ -0,0 +1,131 @@ +// Copyright 2019-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions +// are met: +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// * Neither the name of NVIDIA CORPORATION nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. 
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+#pragma once
+
+#include <stdint.h>
+
+#include <map>
+#include <memory>
+#include <string>
+
+#include "libtorch_utils.h"
+#include "naming_convention.hh"
+#include "triton/backend/backend_common.h"
+#include "triton/backend/backend_input_collector.h"
+#include "triton/backend/backend_memory.h"
+#include "triton/backend/backend_model.h"
+#include "triton/backend/backend_model_instance.h"
+#include "triton/backend/backend_output_responder.h"
+#include "triton/common/nvtx.h"
+#include "triton/core/tritonbackend.h"
+
+// for thread control
+// https://pytorch.org/docs/stable/notes/cpu_threading_torchscript_inference.html#runtime-api
+// https://github.com/pytorch/pytorch/blob/v2.2.1-rc3/aten/src/ATen/Parallel.h#L133
+#include <ATen/Parallel.h>
+
+
+namespace triton::backend::pytorch {
+
+class ModelState : public triton::backend::BackendModel {
+ private:
+  // Flag to indicate whether optimized execution is enabled. Defaults to true.
+  bool enable_optimized_execution_;
+
+  // Flag to indicate whether inference mode is enabled. Defaults to true.
+  bool enable_inference_mode_;
+
+  // Flag to indicate whether cudnn is enabled. Defaults to true.
+  bool enable_cudnn_;
+
+  // Flag to indicate whether cache cleaning after each run is enabled.
+  // Defaults to false.
+  bool enable_cache_cleaning_;
+
+  // Flag to indicate whether weight sharing is enabled. Defaults to false.
+  bool enable_weight_sharing_;
+
+  // Flag pairs to indicate if various JIT settings are set and
+  // enabled respectively. Defaults to (false, true). Default behavior
+  // is to do nothing if not explicitly set.
+  std::pair<bool, bool> enable_tensor_fuser_pair_;
+  std::pair<bool, bool> enable_jit_profiling_pair_;
+  std::pair<bool, bool> enable_jit_executor_pair_;
+
+  // Model mapping for shared TorchScript model across all instances on the
+  // same device. The key is a pair of isGPU and device index.
+  std::map<
+      std::pair<bool, int64_t>, std::shared_ptr<torch::jit::Module>>
+      torch_models_;
+
+  // model_outputs is a map that contains unique outputs that the model must
+  // provide. The first pair is the model output index and the second is
+  // the index in the model state, -1 is used if one is not required.
+  // In the model configuration, the output in the state configuration
+  // can have intersection with the outputs section of the model. If an output
+  // is specified both in the output section and state section, it indicates
+  // that the backend must return the output state to the client too.
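+  // For example, {-1, 2} marks a tensor that is only the third configured
+  // state, while {0, 2} marks one that is both the first configured output
+  // and the third configured state.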
+  std::map<std::string, std::pair<int64_t, int64_t>> model_outputs_;
+
+ public:
+  virtual ~ModelState() = default;
+
+  static TRITONSERVER_Error* Create(
+      TRITONBACKEND_Model* triton_model, ModelState** state);
+
+  bool EnabledCacheCleaning();
+
+  bool EnabledCudnn();
+
+  bool EnabledInferenceMode();
+
+  const std::pair<bool, bool>& EnabledJitExecutor() const;
+
+  const std::pair<bool, bool>& EnabledJitProfiling() const;
+
+  bool EnabledOptimizedExecution();
+
+  const std::pair<bool, bool>& EnabledTensorExprFuser() const;
+
+  bool EnabledWeightSharing();
+
+  TRITONSERVER_Error* LoadModel(
+      const std::string& artifact_name, const torch::Device device,
+      std::string* model_path, const TRITONSERVER_InstanceGroupKind& kind,
+      std::shared_ptr<torch::jit::Module>* torch_model);
+
+  const std::map<std::string, std::pair<int64_t, int64_t>>& ModelOutputs();
+
+ private:
+  ModelState(TRITONBACKEND_Model* triton_model);
+
+  TRITONSERVER_Error* AutoCompleteConfig();
+
+  TRITONSERVER_Error* ParseParameters();
+};
+
+} // namespace triton::backend::pytorch
diff --git a/src/naming_convention.hh b/src/naming_convention.hh
new file mode 100644
index 0000000..756cba4
--- /dev/null
+++ b/src/naming_convention.hh
@@ -0,0 +1,40 @@
+// Copyright 2019-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions
+// are met:
+//  * Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+//  * Redistributions in binary form must reproduce the above copyright
+//    notice, this list of conditions and the following disclaimer in the
+//    documentation and/or other materials provided with the distribution.
+//  * Neither the name of NVIDIA CORPORATION nor the names of its
+//    contributors may be used to endorse or promote products derived
+//    from this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+#pragma once
+
+
+namespace triton::backend::pytorch {
+
+// The naming convention followed for inputs/outputs in the model
+// configuration. Outputs don't support FORWARD_ARGUMENT.
+enum class NamingConvention {
+  NAMED_INDEX,
+  FORWARD_ARGUMENT,
+  STRICT_CONFIG_ORDERING
+};
+
+} // namespace triton::backend::pytorch
diff --git a/src/string_utils.cc b/src/string_utils.cc
new file mode 100644
index 0000000..a605c7c
--- /dev/null
+++ b/src/string_utils.cc
@@ -0,0 +1,254 @@
+// Copyright 2019-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions
+// are met:
+//  * Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+// * Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// * Neither the name of NVIDIA CORPORATION nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +#include "string_utils.hh" + + +namespace triton::backend::pytorch { + +// This function will return a tensor's contents as a contiguous +// chunk in system memory. In some cases this will require copying the data. +// If that happens, 'contiguous_buffer' will be set to hold the contiguous +// chunk and 'cuda_copy' will be set to indicate whether CUDA copy is +// conducted. The data copy can be avoided if the input is already in +// a contiguous chunk and the input is located in memory type and id +// specified. +TRITONSERVER_Error* +GetContiguousInputContent( + TRITONBACKEND_Input* rinput, const uint32_t buffer_count, + const char** content, size_t* content_byte_size, + std::vector* contiguous_buffer, cudaStream_t stream, bool* cuda_copy) +{ + *cuda_copy = false; + + // Check input buffers to see if data copy is necessary + size_t chunk_count = 0; + bool type_mismatch = false; + uint64_t total_byte_size = 0; + for (size_t idx = 0; idx < buffer_count; ++idx) { + TRITONSERVER_MemoryType src_memory_type; + int64_t src_memory_type_id; + size_t src_byte_size; + const void* src_ptr; + + RETURN_IF_ERROR(TRITONBACKEND_InputBuffer( + rinput, idx, &src_ptr, &src_byte_size, &src_memory_type, + &src_memory_type_id)); + + if (src_ptr != nullptr) { + chunk_count++; + total_byte_size += src_byte_size; + type_mismatch |= (src_memory_type == TRITONSERVER_MEMORY_GPU); + } + } + + if (chunk_count == 0) { + *content = nullptr; + *content_byte_size = 0; + } else if ((chunk_count == 1) && !type_mismatch) { + TRITONSERVER_MemoryType src_memory_type; + int64_t src_memory_type_id; + RETURN_IF_ERROR(TRITONBACKEND_InputBuffer( + rinput, 0, (const void**)content, content_byte_size, &src_memory_type, + &src_memory_type_id)); + } else { + contiguous_buffer->resize(total_byte_size); + + size_t offset = 0; + for (size_t i = 0; i < chunk_count; i++) { + bool cuda_used; + TRITONSERVER_MemoryType src_memory_type; + int64_t src_memory_type_id; + size_t src_byte_size; + const void* src_ptr; + + RETURN_IF_ERROR(TRITONBACKEND_InputBuffer( + rinput, i, &src_ptr, &src_byte_size, &src_memory_type, + &src_memory_type_id)); + RETURN_IF_ERROR(CopyBuffer( + "Contiguous input", src_memory_type, src_memory_type_id, + TRITONSERVER_MEMORY_CPU, 0, src_byte_size, src_ptr, + contiguous_buffer->data() + offset, stream, &cuda_used)); + *cuda_copy |= 
cuda_used;
+      offset += src_byte_size;
+    }
+
+    *content = contiguous_buffer->data();
+    *content_byte_size = total_byte_size;
+  }
+
+  return nullptr; // success
+}
+
+void
+FillStringTensor(torch::List<std::string>* input_list, const size_t cnt)
+{
+  for (size_t c = 0; c < cnt; ++c) {
+    input_list->push_back("");
+  }
+}
+
+bool
+SetStringBuffer(
+    torch::List<torch::jit::IValue>* tensor, TRITONBACKEND_Response** response,
+    TRITONBACKEND_Output* response_output, TRITONBACKEND_State* response_state,
+    const size_t tensor_element_count, cudaStream_t stream,
+    std::string* serialized, bool state)
+{
+  bool cuda_copy = false;
+
+  // Serialize the output tensor strings. Each string is serialized as
+  // a 4-byte length followed by the string itself with no
+  // null-terminator.
+  serialized->clear();
+  for (size_t e = 0; e < tensor_element_count; ++e) {
+    std::string str = tensor->get(e).to<std::string>();
+    const char* cstr = str.c_str();
+    size_t len = str.length();
+    serialized->append(reinterpret_cast<const char*>(&len), sizeof(uint32_t));
+    if (len > 0) {
+      serialized->append(cstr, len);
+    }
+  }
+
+  // Allocate a buffer large enough to hold the serialized tensor.
+  TRITONSERVER_MemoryType actual_memory_type = TRITONSERVER_MEMORY_CPU;
+  int64_t actual_memory_type_id = 0;
+
+  TRITONSERVER_Error* err;
+  void* buffer;
+
+  if (!state) {
+    auto err = TRITONBACKEND_OutputBuffer(
+        response_output, &buffer, serialized->size(), &actual_memory_type,
+        &actual_memory_type_id);
+    if (err != nullptr) {
+      RESPOND_AND_SET_NULL_IF_ERROR(response, err);
+      return cuda_copy;
+    }
+  } else {
+    auto err = TRITONBACKEND_StateBuffer(
+        response_state, &buffer, serialized->size(), &actual_memory_type,
+        &actual_memory_type_id);
+    if (err != nullptr) {
+      RESPOND_AND_SET_NULL_IF_ERROR(response, err);
+      return cuda_copy;
+    }
+  }
+  // Copy the serialized tensor into the allocated buffer.
+  bool cuda_used = false;
+  err = CopyBuffer(
+      "String output", TRITONSERVER_MEMORY_CPU /* src_memory_type */,
+      0 /* src_memory_type_id */, actual_memory_type, actual_memory_type_id,
+      serialized->size(), reinterpret_cast<const void*>(serialized->c_str()),
+      buffer, stream, &cuda_used);
+  cuda_copy |= cuda_used;
+
+  if (err != nullptr) {
+    RESPOND_AND_SET_NULL_IF_ERROR(response, err);
+    return cuda_copy;
+  }
+
+  if (state) {
+    RESPOND_AND_SET_NULL_IF_ERROR(
+        response, TRITONBACKEND_StateUpdate(response_state));
+  }
+
+  return cuda_copy;
+}
+
+bool
+SetStringInputTensor(
+    torch::List<std::string>* input_list, TRITONBACKEND_Input* input,
+    const char* name, const uint32_t buffer_count,
+    const size_t request_element_cnt, TRITONBACKEND_Response** response,
+    cudaStream_t stream, const char* host_policy_name)
+{
+  bool cuda_copy = false;
+
+  // For the string data type, the data always needs to be on the CPU so
+  // that the string lengths can be read and the strings constructed
+  // properly. So if the request's input tensor is not on the CPU, copy it
+  // there.
+  const char* content = nullptr;
+  size_t content_byte_size = 0;
+
+  std::vector<char> contiguous_buffer;
+  auto err = GetContiguousInputContent(
+      input, buffer_count, &content, &content_byte_size, &contiguous_buffer,
+      stream, &cuda_copy);
+  if (err != nullptr) {
+    RESPOND_AND_SET_NULL_IF_ERROR(response, err);
+    FillStringTensor(input_list, request_element_cnt);
+    return cuda_copy;
+  }
+
+#ifdef TRITON_ENABLE_GPU
+  if (cuda_copy) {
+    cudaStreamSynchronize(stream);
+    cuda_copy = false;
+  }
+#endif // TRITON_ENABLE_GPU
+
+  std::vector<std::pair<const char*, const uint32_t>> str_list;
+  err = ValidateStringBuffer(
+      content, content_byte_size, request_element_cnt, name, &str_list);
+  // Set string values.
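+  // Each (address, length) entry in 'str_list' points into 'content', so the
+  // bytes are copied into std::string objects here before 'contiguous_buffer'
+  // goes out of scope.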
+  for (const auto& [addr, len] : str_list) {
+    input_list->push_back(std::string(addr, len));
+  }
+
+  size_t element_cnt = str_list.size();
+  if (err != nullptr) {
+    RESPOND_AND_SET_NULL_IF_ERROR(response, err);
+    FillStringTensor(input_list, request_element_cnt - element_cnt);
+  }
+  return cuda_copy;
+}
+
+bool
+SetStringOutputBuffer(
+    torch::List<torch::jit::IValue>* tensor, TRITONBACKEND_Response** response,
+    TRITONBACKEND_Output* response_output, const size_t tensor_element_count,
+    cudaStream_t stream, std::string* serialized)
+{
+  return SetStringBuffer(
+      tensor, response, response_output, nullptr /* response_state */,
+      tensor_element_count, stream, serialized, false /* state */);
+}
+
+bool
+SetStringStateBuffer(
+    torch::List<torch::jit::IValue>* tensor, TRITONBACKEND_Response** response,
+    TRITONBACKEND_State* response_state, const size_t tensor_element_count,
+    cudaStream_t stream, std::string* serialized)
+{
+  return SetStringBuffer(
+      tensor, response, nullptr /* response_output */, response_state,
+      tensor_element_count, stream, serialized, true /* state */);
+}
+
+} // namespace triton::backend::pytorch
diff --git a/src/string_utils.hh b/src/string_utils.hh
new file mode 100644
index 0000000..8373478
--- /dev/null
+++ b/src/string_utils.hh
@@ -0,0 +1,106 @@
+// Copyright 2019-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions
+// are met:
+//  * Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+//  * Redistributions in binary form must reproduce the above copyright
+//    notice, this list of conditions and the following disclaimer in the
+//    documentation and/or other materials provided with the distribution.
+//  * Neither the name of NVIDIA CORPORATION nor the names of its
+//    contributors may be used to endorse or promote products derived
+//    from this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+#pragma once
+
+#include <stdint.h>
+
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "libtorch_utils.h"
+#include "triton/backend/backend_common.h"
+#include "triton/backend/backend_input_collector.h"
+#include "triton/backend/backend_memory.h"
+#include "triton/backend/backend_model.h"
+#include "triton/backend/backend_model_instance.h"
+#include "triton/backend/backend_output_responder.h"
+#include "triton/common/nvtx.h"
+#include "triton/core/tritonbackend.h"
+
+#ifdef TRITON_PYTORCH_ENABLE_TORCHVISION
+// Suppress warnings in torch headers
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wsign-compare"
+#pragma warning(push, 0)
+#include <torchvision/ops/ops.h>
+#include <torchvision/vision.h> // Torchvision header
+#pragma warning(pop)
+#pragma GCC diagnostic pop
+#endif // TRITON_PYTORCH_ENABLE_TORCHVISION
+
+#ifdef TRITON_ENABLE_GPU
+#include <c10/cuda/CUDACachingAllocator.h>
+#include <c10/cuda/CUDAStream.h>
+#include <cuda_runtime_api.h>
+#endif // TRITON_ENABLE_GPU
+
+// for thread control
+// https://pytorch.org/docs/stable/notes/cpu_threading_torchscript_inference.html#runtime-api
+// https://github.com/pytorch/pytorch/blob/v2.2.1-rc3/aten/src/ATen/Parallel.h#L133
+#include <ATen/Parallel.h>
+
+
+namespace triton::backend::pytorch {
+
+void FillStringTensor(torch::List<std::string>* input_list, const size_t cnt);
+
+// This function will return a tensor's contents as a contiguous
+// chunk in system memory. In some cases this will require copying the data.
+// If that happens, 'contiguous_buffer' will be set to hold the contiguous
+// chunk and 'cuda_copy' will be set to indicate whether CUDA copy is
+// conducted. The data copy can be avoided if the input is already in
+// a contiguous chunk and the input is located in memory type and id
+// specified.
+TRITONSERVER_Error* GetContiguousInputContent(
+    TRITONBACKEND_Input* rinput, const uint32_t buffer_count,
+    const char** content, size_t* content_byte_size,
+    std::vector<char>* contiguous_buffer, cudaStream_t stream, bool* cuda_copy);
+
+bool SetStringBuffer(
+    torch::List<torch::jit::IValue>* tensor, TRITONBACKEND_Response** response,
+    TRITONBACKEND_Output* response_output, TRITONBACKEND_State* response_state,
+    const size_t tensor_element_count, cudaStream_t stream,
+    std::string* serialized, bool state);
+
+bool SetStringInputTensor(
+    torch::List<std::string>* input_list, TRITONBACKEND_Input* input,
+    const char* name, const uint32_t buffer_count,
+    const size_t request_element_cnt, TRITONBACKEND_Response** response,
+    cudaStream_t stream, const char* host_policy_name);
+
+bool SetStringOutputBuffer(
+    torch::List<torch::jit::IValue>* tensor, TRITONBACKEND_Response** response,
+    TRITONBACKEND_Output* response_output, const size_t tensor_element_count,
+    cudaStream_t stream, std::string* serialized);
+
+bool SetStringStateBuffer(
+    torch::List<torch::jit::IValue>* tensor, TRITONBACKEND_Response** response,
+    TRITONBACKEND_State* response_state, const size_t tensor_element_count,
+    cudaStream_t stream, std::string* serialized);
+
+} // namespace triton::backend::pytorch
diff --git a/tools/gen_pb_exec_env.sh b/tools/gen_pb_exec_env.sh
new file mode 100755
index 0000000..19539cd
--- /dev/null
+++ b/tools/gen_pb_exec_env.sh
@@ -0,0 +1,53 @@
+#!/bin/bash
+# Copyright 2023-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+#  * Redistributions of source code must retain the above copyright
+#    notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +# install conda +rm -rf ./miniconda +wget https://repo.anaconda.com/miniconda/Miniconda3-py312_25.7.0-2-Linux-x86_64.sh +bash Miniconda3-py312_25.7.0-2-Linux-x86_64.sh -p ./miniconda -b +eval "$(./miniconda/bin/conda shell.bash hook)" + +# create conda environment +conda tos accept --override-channels --channel https://repo.anaconda.com/pkgs/main +conda tos accept --override-channels --channel https://repo.anaconda.com/pkgs/r +conda create -n pt python=3.12 -y +conda activate pt +conda install -c conda-forge conda-pack -y + +# pre install step +export PYTHONNOUSERSITE=True +conda install -c conda-forge libstdcxx-ng=15 -y + +# install PyTorch +conda install pytorch torchvision torchaudio pytorch-cuda=12.4 -c pytorch -c nvidia -y + +# pack environment +rm -f pb_exec_env_model.py.tar.gz +conda pack -o pb_exec_env_model.py.tar.gz + +# deactivate conda +conda deactivate
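
The script above only produces the relocatable conda-pack archive. As a hedged sketch of how such an archive is typically consumed, the model name, repository layout, and use of the Python backend's EXECUTION_ENV_PATH parameter below are illustrative assumptions rather than something this change establishes:

```bash
# Illustrative only: generate the archive, then point a Python-based model at
# it via the Python backend's EXECUTION_ENV_PATH parameter.
./tools/gen_pb_exec_env.sh
cp pb_exec_env_model.py.tar.gz model_repository/my_model/
cat >> model_repository/my_model/config.pbtxt <<'EOF'
parameters: {
  key: "EXECUTION_ENV_PATH"
  value: {string_value: "$$TRITON_MODEL_DIRECTORY/pb_exec_env_model.py.tar.gz"}
}
EOF
```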