diff --git a/.clang-format b/.clang-format index 98c64973..bf96a593 100644 --- a/.clang-format +++ b/.clang-format @@ -2,6 +2,7 @@ BasedOnStyle: Google IndentWidth: 2 +ColumnLimit: 80 ContinuationIndentWidth: 4 UseTab: Never MaxEmptyLinesToKeep: 2 @@ -34,4 +35,4 @@ BinPackArguments: true BinPackParameters: true ConstructorInitializerAllOnOneLineOrOnePerLine: false -IndentCaseLabels: true \ No newline at end of file +IndentCaseLabels: true diff --git a/.devcontainer/Dockerfile b/.devcontainer/Dockerfile new file mode 100644 index 00000000..737725bb --- /dev/null +++ b/.devcontainer/Dockerfile @@ -0,0 +1,48 @@ +# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +FROM nvcr.io/nvidia/tritonserver:24.03-py3 + +ARG USERNAME=triton-server + +RUN apt-get update \ + && apt-get install -y sudo + +RUN pip3 install transformers torch + +# Create the user +RUN apt-get update \ + && apt-get install -y sudo \ + && echo $USERNAME ALL=\(root\) NOPASSWD:ALL > /etc/sudoers.d/$USERNAME \ + && chmod 0440 /etc/sudoers.d/$USERNAME + +RUN pip3 install pre-commit ipdb + +RUN mkhomedir_helper triton-server + +RUN apt-get install -y cmake rapidjson-dev + +USER ${USERNAME} diff --git a/.devcontainer/devcontainer.json b/.devcontainer/devcontainer.json new file mode 100644 index 00000000..e1b8bd10 --- /dev/null +++ b/.devcontainer/devcontainer.json @@ -0,0 +1,26 @@ +{ + "name": "Python Backend", + + "build": { + "dockerfile": "Dockerfile" + }, + "customizations": { + "vscode": { + "extensions": [ + "ms-python.vscode-pylance", + "ms-python.python", + "ms-vscode.cpptools-extension-pack", + "ms-vscode.cmake-tools", + "github.vscode-pull-request-github" + ] + } + }, + "postCreateCommand": "sudo chown -R triton-server:triton-server ~/.cache", + + "runArgs": [ "--cap-add=SYS_PTRACE", "--security-opt", "seccomp=unconfined", "--gpus=all", "--shm-size=2g", "--ulimit", "stack=67108864" ], + "mounts": [ + "source=${localEnv:HOME}/.ssh,target=/home/triton-server/.ssh,type=bind,consistency=cached", + "source=${localEnv:HOME}/.cache/huggingface,target=/home/triton-server/.cache/huggingface,type=bind,consistency=cached" + ], + "remoteUser": "triton-server" +} diff --git a/.github/workflows/codeql.yml b/.github/workflows/codeql.yml index a724718d..745a3373 100644 --- a/.github/workflows/codeql.yml +++ b/.github/workflows/codeql.yml @@ -27,13 +27,7 @@ name: "CodeQL" on: - push: - branches: [ 'main' ] pull_request: - # The branches below must be a subset of the branches above - branches: [ 'main' ] - schedule: - - cron: '0 1 * * 1-6' jobs: analyze: @@ -63,12 +57,12 @@ jobs: # If you wish to specify custom queries, you can do so here or in a config file. # By default, queries listed here will override any specified in a config file. # Prefix the list here with "+" to use these queries and those in the config file. - + # Details on CodeQL's query packs refer to: # https://docs.github.com/en/code-security/code-scanning/automatically-scanning-your-code-for-vulnerabilities-and-errors/configuring-code-scanning#using-queries-in-ql-packs queries: +security-and-quality - + # Autobuild attempts to build any compiled languages (C/C++, C#, Go, or Java). # If this step fails, then you should remove it and run the build manually (see below) - name: Autobuild @@ -77,7 +71,7 @@ jobs: # Command-line programs to run using the OS shell. # See https://docs.github.com/en/actions/using-workflows/workflow-syntax-for-github-actions#jobsjob_idstepsrun - # If the Autobuild fails above, remove it and uncomment the following three lines. + # If the Autobuild fails above, remove it and uncomment the following three lines. # modify them (or add more) to build your code if your project, please refer to the EXAMPLE below for guidance. # - run: | diff --git a/.github/workflows/pre-commit.yml b/.github/workflows/pre-commit.yml new file mode 100644 index 00000000..4fa18732 --- /dev/null +++ b/.github/workflows/pre-commit.yml @@ -0,0 +1,38 @@ +# Copyright 2023-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +name: pre-commit + +on: + pull_request: + +jobs: + pre-commit: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v5.0.0 + - uses: actions/setup-python@v6.0.0 + - uses: pre-commit/action@v3.0.1 diff --git a/.gitignore b/.gitignore index bf7e1686..419005f0 100644 --- a/.gitignore +++ b/.gitignore @@ -1,5 +1,4 @@ /build -/.vscode *.so builddir @@ -139,3 +138,6 @@ dmypy.json # pytype static type analyzer .pytype/ +# vscode +.vscode/settings.json +.vscode/c_cpp_properties.json diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml new file mode 100644 index 00000000..3c76a6ed --- /dev/null +++ b/.pre-commit-config.yaml @@ -0,0 +1,73 @@ +# Copyright 2023-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +repos: +- repo: https://github.com/PyCQA/isort + rev: 5.12.0 + hooks: + - id: isort + additional_dependencies: [toml] +- repo: https://github.com/psf/black + rev: 23.1.0 + hooks: + - id: black + types_or: [python, cython] +- repo: https://github.com/PyCQA/flake8 + rev: 7.3.0 + hooks: + - id: flake8 + args: [--max-line-length=88, --select=C,E,F,W,B,B950, --extend-ignore = E203,E501] + types_or: [python, cython] +- repo: https://github.com/pre-commit/mirrors-clang-format + rev: v16.0.5 + hooks: + - id: clang-format + types_or: [c, c++, cuda, proto, textproto, java] + args: ["-fallback-style=none", "-style=file", "-i"] +- repo: https://github.com/codespell-project/codespell + rev: v2.2.4 + hooks: + - id: codespell + additional_dependencies: [tomli] + args: ["--toml", "pyproject.toml"] + exclude: (?x)^(.*stemmer.*|.*stop_words.*|^CHANGELOG.md$) +# More details about these pre-commit hooks here: +# https://pre-commit.com/hooks.html +- repo: https://github.com/pre-commit/pre-commit-hooks + rev: v6.0.0 + hooks: + - id: check-case-conflict + - id: check-executables-have-shebangs + - id: check-merge-conflict + - id: check-json + - id: check-toml + - id: check-yaml + - id: check-shebang-scripts-are-executable + - id: end-of-file-fixer + types_or: [c, c++, cuda, proto, textproto, java, python] + - id: mixed-line-ending + - id: requirements-txt-fixer + - id: trailing-whitespace diff --git a/.vscode/tasks.json b/.vscode/tasks.json new file mode 100644 index 00000000..597a746d --- /dev/null +++ b/.vscode/tasks.json @@ -0,0 +1,85 @@ +{ + "version": "2.0.0", + "tasks": [ + { + "label": "Configure", + "type": "shell", + "command": "cmake", + "args": [ + "-DCMAKE_INSTALL_PREFIX:STRING=/opt/tritonserver/", + "-DTRITON_COMMON_REPO_TAG:STRING=main", + "-DTRITON_BACKEND_REPO_TAG:STRING=main", + "-DTRITON_CORE_REPO_TAG:STRING=main", + "-DTRITON_ENABLE_GPU:STRING=ON", + "-DTRITON_ENABLE_NVTX:STRING=ON", + "-DCMAKE_INSTALL_PREFIX:STRING=${workspaceFolder}/build/install", + "-DCMAKE_EXPORT_COMPILE_COMMANDS:BOOL=TRUE", + "-DCMAKE_BUILD_TYPE:STRING=Debug", + "-DCMAKE_C_COMPILER:FILEPATH=/usr/bin/gcc", + "-DCMAKE_CXX_COMPILER:FILEPATH=/usr/bin/g++", + "-S${workspaceFolder}", + "-B${workspaceFolder}/build", + "-G", + "Unix Makefiles" + ], + "problemMatcher": [] + }, + { + "label": "Build", + "type": "shell", + "command": "cmake", + "args": [ + "--build", + "/${workspaceFolder}/build", + "--config", + "Debug", + "--target", + "all", + "-j", + "18", + "--" + ] + }, + { + "label": "Install", + "type": "shell", + "command": "cmake", + "args": [ + "--build", + "${workspaceFolder}/build", + "--config", + "Debug", + "--target", + "install", + "-j", + "18", + "--" + ] + }, + { + "label": "Move", + "type": "shell", + "command": "sudo", + "args": [ + "cp", + "-r", + "${workspaceFolder}/build/install/backends/python/*", + "/opt/tritonserver/backends/python" + ] + }, + { + "label": "Build Python Backend", + "dependsOrder": "sequence", + "dependsOn": [ + "Configure", + "Build", + "Install", + "Move" + ], 
+ "group": { + "kind": "build", + "isDefault": true + } + } + ] +} diff --git a/CMakeLists.txt b/CMakeLists.txt index dcc248bf..f5c5b293 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,4 +1,4 @@ -# Copyright 2020-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# Copyright 2020-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions @@ -24,10 +24,13 @@ # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -cmake_minimum_required(VERSION 3.17) +cmake_minimum_required(VERSION 3.31.8) project(tritonpythonbackend LANGUAGES C CXX) +# Use C++17 standard as Triton's minimum required. +set(TRITON_MIN_CXX_STANDARD 17 CACHE STRING "The minimum C++ standard which features are requested to build this target.") + # # Options # @@ -38,6 +41,13 @@ option(TRITON_ENABLE_GPU "Enable GPU support in backend" ON) option(TRITON_ENABLE_STATS "Include statistics collections in backend" ON) option(TRITON_ENABLE_NVTX "Include nvtx markers collection in backend." OFF) +# FIXME: CI needs to enable the GPU flag. Python for window currently does not +# support GPU tensors. For simplicity, we will override this option here. +if(WIN32) + set(TRITON_ENABLE_GPU OFF CACHE BOOL "GPU disabled" FORCE) +endif() + +set(TRITON_REPO_ORGANIZATION "https://github.com/triton-inference-server" CACHE STRING "Git repository to pull from") set(TRITON_BACKEND_REPO_TAG "main" CACHE STRING "Tag for triton-inference-server/backend repo") set(TRITON_COMMON_REPO_TAG "main" CACHE STRING "Tag for triton-inference-server/common repo") set(TRITON_CORE_REPO_TAG "main" CACHE STRING "Tag for triton-inference-server/core repo") @@ -49,7 +59,7 @@ endif() # # Dependencies # -# FetchContent's composibility isn't very good. We must include the +# FetchContent's composability isn't very good. We must include the # transitive closure of all repos so that we can override the tag. # include(FetchContent) @@ -60,17 +70,17 @@ include(ExternalProject) FetchContent_Declare( repo-common - GIT_REPOSITORY https://github.com/triton-inference-server/common.git + GIT_REPOSITORY ${TRITON_REPO_ORGANIZATION}/common.git GIT_TAG ${TRITON_COMMON_REPO_TAG} ) FetchContent_Declare( repo-core - GIT_REPOSITORY https://github.com/triton-inference-server/core.git + GIT_REPOSITORY ${TRITON_REPO_ORGANIZATION}/core.git GIT_TAG ${TRITON_CORE_REPO_TAG} ) FetchContent_Declare( repo-backend - GIT_REPOSITORY https://github.com/triton-inference-server/backend.git + GIT_REPOSITORY ${TRITON_REPO_ORGANIZATION}/backend.git GIT_TAG ${TRITON_BACKEND_REPO_TAG} ) FetchContent_MakeAvailable(repo-common repo-core repo-backend) @@ -78,9 +88,21 @@ FetchContent_MakeAvailable(repo-common repo-core repo-backend) FetchContent_Declare( pybind11 GIT_REPOSITORY "https://github.com/pybind/pybind11" - GIT_TAG "v2.10" + # COMMIT ID for v2.12.0 + GIT_TAG "3e9dfa2866941655c56877882565e7577de6fc7b" GIT_SHALLOW ON ) + +# RHEL base container has multiple version of Python installed. By default +# it seems like pybind will pickup v3.6, so we specifically assign it to +# search for 3.12 here. 
+set(RHEL_BUILD OFF) +if(LINUX) + file(STRINGS "/etc/os-release" DISTRO_ID_LIKE REGEX "ID_LIKE") + if(${DISTRO_ID_LIKE} MATCHES "rhel|centos") + set(RHEL_BUILD ON) + endif(${DISTRO_ID_LIKE} MATCHES "rhel|centos") +endif(LINUX) FetchContent_MakeAvailable(pybind11) # @@ -89,18 +111,23 @@ FetchContent_MakeAvailable(pybind11) FetchContent_Declare( dlpack GIT_REPOSITORY "https://github.com/dmlc/dlpack" - GIT_TAG "v0.7" + GIT_TAG "v0.8" GIT_SHALLOW ON ) +# Option must be set off so WIN32 build does not break +set(CMAKE_POLICY_DEFAULT_CMP0077 NEW) +set(BUILD_MOCK OFF) FetchContent_MakeAvailable(dlpack) # # Boost # +set(TRITON_BOOST_URL "https://archives.boost.io/release/1.80.0/source/boost_1_80_0.tar.gz" CACHE STRING "Boost source code URL") + ExternalProject_Add( boostorg - URL https://boostorg.jfrog.io/artifactory/main/release/1.79.0/source/boost_1_79_0.tar.gz - URL_HASH SHA256=273f1be93238a068aba4f9735a4a2b003019af067b9c183ed227780b8f36062c + URL ${TRITON_BOOST_URL} + URL_HASH SHA256=4b2136f98bdd1f5857f1c3dea9ac2018effe65286cf251534b6ae20cc45e1847 PREFIX "boost-src" CONFIGURE_COMMAND ${CMAKE_COMMAND} -E copy_directory /boost/ ${CMAKE_BINARY_DIR}/boost @@ -115,7 +142,7 @@ set(boostorg_INCLUDE_DIRS "${CMAKE_BINARY_DIR}/boost/") if(${TRITON_ENABLE_GPU}) find_package(CUDAToolkit REQUIRED) message(STATUS "Using CUDA ${CUDA_VERSION}") - set(CUDA_NVCC_FLAGS -std=c++11) + set(CUDA_NVCC_FLAGS -std=c++${TRITON_MIN_CXX_STANDARD}) elseif() message(WARNING "TRITON_ENABLE_GPU is OFF, GPU Tensor support will be disabled") endif() # TRITON_ENABLE_GPU @@ -125,17 +152,24 @@ if(${TRITON_ENABLE_NVTX}) endif() # TRITON_ENABLE_NVTX find_package(ZLIB REQUIRED) -find_package(Threads REQUIRED) + +if(NOT WIN32) + find_package(Threads REQUIRED) +endif() include_directories(${CMAKE_BINARY_DIR}) configure_file(src/libtriton_python.ldscript libtriton_python.ldscript COPYONLY) set( COMMON_SRCS + src/correlation_id.cc + src/correlation_id.h src/infer_response.cc src/infer_response.h src/infer_request.cc src/infer_request.h + src/infer_trace.cc + src/infer_trace.h src/message_queue.h src/ipc_message.cc src/ipc_message.h @@ -159,24 +193,32 @@ set( src/shm_manager.h src/pb_exception.h src/pb_preferred_memory.h + src/metric.h + src/metric.cc + src/metric_family.h + src/metric_family.cc + src/gpu_buffers.cc + src/gpu_buffers.h + src/model_loader.h + src/model_loader.cc ) set( - PYTHON_BACKEND_SRCS - src/python_be.cc - src/python_be.h - src/pb_env.cc - src/pb_env.h - src/pb_metric_reporter.cc - src/pb_metric_reporter.h - src/memory_manager.cc - src/memory_manager.h - src/request_executor.cc - src/request_executor.h - src/stub_launcher.h - src/stub_launcher.cc - src/infer_payload.h - src/infer_payload.cc + PYTHON_BACKEND_SRCS + src/python_be.cc + src/python_be.h + src/pb_env.cc + src/pb_env.h + src/pb_metric_reporter.cc + src/pb_metric_reporter.h + src/memory_manager.cc + src/memory_manager.h + src/request_executor.cc + src/request_executor.h + src/stub_launcher.h + src/stub_launcher.cc + src/infer_payload.h + src/infer_payload.cc ) list(APPEND @@ -197,8 +239,14 @@ set( src/response_sender.h src/pb_stub.h src/pb_stub.cc + src/pb_stub_log.h + src/pb_stub_log.cc src/pb_response_iterator.h src/pb_response_iterator.cc + src/pb_cancel.cc + src/pb_cancel.h + src/pb_bls_cancel.cc + src/pb_bls_cancel.h ) list(APPEND @@ -220,53 +268,104 @@ add_library( TritonPythonBackend::triton-python-backend ALIAS triton-python-backend ) -target_compile_features(triton-python-backend PRIVATE cxx_std_11) 
+target_compile_features(triton-python-backend PRIVATE cxx_std_${TRITON_MIN_CXX_STANDARD}) target_compile_options( triton-python-backend PRIVATE $<$,$,$>: - -Wall -Wextra -Wno-unused-parameter -Wno-type-limits -Werror> + -Wall -Wextra -Wno-unused-parameter -Wno-type-limits> + $<$:/Wall /D_WIN32_WINNT=0x0A00 /EHsc /Zc:preprocessor> ) -target_compile_features(triton-python-backend-stub PRIVATE cxx_std_11) +target_compile_features(triton-python-backend-stub PRIVATE cxx_std_${TRITON_MIN_CXX_STANDARD}) target_compile_options( triton-python-backend-stub PRIVATE $<$,$,$>: - -fvisibility=hidden -Wall -Wextra -Wno-unused-parameter -Wno-type-limits -Werror> + -fvisibility=hidden -Wall -Wextra -Wno-unused-parameter -Wno-type-limits -Werror> + $<$:/Wall /D_WIN32_WINNT=0x0A00 /EHsc /Zc:preprocessor> ) target_compile_definitions(triton-python-backend-stub PRIVATE TRITON_PB_STUB) -target_link_libraries( - triton-python-backend - PRIVATE +# RHEL assets are not released in a container environment nor do the current +# Python lib versions in the manylinux base container match those currently +# available for RHEL8 package managers. Therefore, we package the correct +# python libs in the backend folder and adjust the stub executable to look +# in its own folder at runtime. +if(RHEL_BUILD) + set_target_properties( + triton-python-backend-stub + PROPERTIES + SKIP_BUILD_RPATH TRUE + BUILD_WITH_INSTALL_RPATH TRUE + INSTALL_RPATH_USE_LINK_PATH FALSE + INSTALL_RPATH "$\{ORIGIN\}" + ) +endif(RHEL_BUILD) + + +# For WIN32 do not link Threads and DL_LIBS +if(WIN32) + target_link_libraries( + triton-python-backend + PRIVATE + dlpack + triton-backend-utils # from repo-backend + -lrt # shared memory + triton-core-serverstub # from repo-core + ZLIB::ZLIB + -larchive + ) + + target_link_libraries( + triton-python-backend-stub + PRIVATE + dlpack + triton-backend-utils # from repo-backend + pybind11::embed + -lrt # shared memory + -larchive # libarchive + ) +else() + target_link_libraries( + triton-python-backend + PRIVATE + dlpack + Threads::Threads + triton-backend-utils # from repo-backend + ${CMAKE_DL_LIBS} # dlopen and dlclose + -lrt # shared memory + triton-core-serverstub # from repo-core + ZLIB::ZLIB + -larchive + ) + + target_link_libraries( + triton-python-backend-stub + PRIVATE dlpack Threads::Threads - triton-backend-utils # from repo-backend - ${CMAKE_DL_LIBS} # dlopen and dlclose - -lrt # shared memory - triton-core-serverstub # from repo-core - ZLIB::ZLIB - -larchive -) - -target_link_libraries( - triton-python-backend-stub - PRIVATE - dlpack - Threads::Threads - triton-backend-utils # from repo-backend - ${CMAKE_DL_LIBS} # dlopen and dlclose - pybind11::embed - -lrt # shared memory - -larchive # libarchive -) + triton-backend-utils # from repo-backend + ${CMAKE_DL_LIBS} # dlopen and dlclose + pybind11::embed + -lrt # shared memory + -larchive # libarchive + ) +endif() -set_target_properties( - triton-python-backend PROPERTIES - POSITION_INDEPENDENT_CODE ON - OUTPUT_NAME triton_python - LINK_DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/libtriton_python.ldscript - LINK_FLAGS "-Wl,--version-script libtriton_python.ldscript" -) +if(WIN32) + set_target_properties( + triton-python-backend PROPERTIES + POSITION_INDEPENDENT_CODE ON + OUTPUT_NAME triton_python + ) +else() + set_target_properties( + triton-python-backend PROPERTIES + POSITION_INDEPENDENT_CODE ON + OUTPUT_NAME triton_python + LINK_DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/libtriton_python.ldscript + LINK_FLAGS "-Wl,--version-script libtriton_python.ldscript" + ) 
+endif() add_subdirectory(./src/shm_monitor) diff --git a/README.md b/README.md index 5a6f4952..dd5e877a 100644 --- a/README.md +++ b/README.md @@ -1,5 +1,5 @@ + +# Custom Metrics Example + +In this section we demonstrate an end-to-end example for +[Custom Metrics API](../../README.md#custom-metrics) in Python backend. The +[model repository](https://github.com/triton-inference-server/server/blob/main/docs/user_guide/model_repository.md) +should contain [custom_metrics](./model.py) model. The +[custom_metrics](./model.py) model uses +[Custom Metrics API](../../README.md#custom-metrics) to register and collect +custom metrics. + +## Deploying the Custom Metrics Models + +1. Create the model repository: + +```console +mkdir -p models/custom_metrics/1/ + +# Copy the Python models +cp examples/custom_metrics/model.py models/custom_metrics/1/model.py +cp examples/custom_metrics/config.pbtxt models/custom_metrics/config.pbtxt +``` + +2. Start the tritonserver: + +``` +tritonserver --model-repository `pwd`/models +``` + +3. Send inference requests to server: + +``` +python3 examples/custom_metrics/client.py +``` + +You should see an output similar to the output below in the client terminal: + +``` +custom_metrics example: found pattern '# HELP requests_process_latency_ns Cumulative time spent processing requests' in metrics +custom_metrics example: found pattern '# TYPE requests_process_latency_ns counter' in metrics +custom_metrics example: found pattern 'requests_process_latency_ns{model="custom_metrics",version="1"}' in metrics +PASS: custom_metrics +``` + +In the terminal that runs Triton Server, you should see an output similar to +the output below: +``` +Cumulative requests processing latency: 223406.0 +``` + +The [model.py](./model.py) model file is heavily commented with +explanations about each of the function calls. + +### Explanation of the Client Output + +The [client.py](./client.py) sends a HTTP request with url +`http://localhost:8002/metrics` to fetch the metrics from Triton server. The +client then verifies if the custom metrics added in the model file are +correctly reported. diff --git a/examples/custom_metrics/client.py b/examples/custom_metrics/client.py new file mode 100644 index 00000000..64ae31e4 --- /dev/null +++ b/examples/custom_metrics/client.py @@ -0,0 +1,98 @@ +# Copyright 2023, NVIDIA CORPORATION& AFFILIATES.All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and / or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. 
+# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED.IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES(INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import sys + +import numpy as np +import requests +import tritonclient.http as httpclient +from tritonclient.utils import * + +model_name = "custom_metrics" +shape = [4] + + +def get_metrics(): + metrics_url = "http://localhost:8002/metrics" + r = requests.get(metrics_url) + r.raise_for_status() + return r.text + + +with httpclient.InferenceServerClient("localhost:8000") as client: + input0_data = np.random.rand(*shape).astype(np.float32) + input1_data = np.random.rand(*shape).astype(np.float32) + inputs = [ + httpclient.InferInput( + "INPUT0", input0_data.shape, np_to_triton_dtype(input0_data.dtype) + ), + httpclient.InferInput( + "INPUT1", input1_data.shape, np_to_triton_dtype(input1_data.dtype) + ), + ] + + inputs[0].set_data_from_numpy(input0_data) + inputs[1].set_data_from_numpy(input1_data) + + outputs = [ + httpclient.InferRequestedOutput("OUTPUT0"), + httpclient.InferRequestedOutput("OUTPUT1"), + ] + + response = client.infer(model_name, inputs, request_id=str(1), outputs=outputs) + + output0_data = response.as_numpy("OUTPUT0") + output1_data = response.as_numpy("OUTPUT1") + + if not np.allclose(input0_data + input1_data, output0_data): + print("custom_metrics example error: incorrect sum") + sys.exit(1) + + if not np.allclose(input0_data - input1_data, output1_data): + print("custom_metrics example error: incorrect difference") + sys.exit(1) + + metrics = get_metrics() + patterns = [ + "# HELP requests_process_latency_ns Cumulative time spent processing requests", + "# TYPE requests_process_latency_ns counter", + 'requests_process_latency_ns{model="custom_metrics",version="1"}', + ] + for pattern in patterns: + if pattern not in metrics: + print( + "custom_metrics example error: missing pattern '{}' in metrics".format( + pattern + ) + ) + sys.exit(1) + else: + print( + "custom_metrics example: found pattern '{}' in metrics".format(pattern) + ) + + print("PASS: custom_metrics") + sys.exit(0) diff --git a/examples/custom_metrics/config.pbtxt b/examples/custom_metrics/config.pbtxt new file mode 100644 index 00000000..2a8192c3 --- /dev/null +++ b/examples/custom_metrics/config.pbtxt @@ -0,0 +1,65 @@ +# Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. 
+# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +name: "custom_metrics" +backend: "python" + +input [ + { + name: "INPUT0" + data_type: TYPE_FP32 + dims: [ 4 ] + } +] +input [ + { + name: "INPUT1" + data_type: TYPE_FP32 + dims: [ 4 ] + } +] +output [ + { + name: "OUTPUT0" + data_type: TYPE_FP32 + dims: [ 4 ] + } +] +output [ + { + name: "OUTPUT1" + data_type: TYPE_FP32 + dims: [ 4 ] + } +] + +instance_group [ + { + count: 3 + kind: KIND_CPU + } +] + diff --git a/examples/custom_metrics/model.py b/examples/custom_metrics/model.py new file mode 100644 index 00000000..ad3b4e6f --- /dev/null +++ b/examples/custom_metrics/model.py @@ -0,0 +1,174 @@ +# Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import json +import time + +# triton_python_backend_utils is available in every Triton Python model. You +# need to use this module to create inference requests and responses. It also +# contains some utility functions for extracting information from model_config +# and converting Triton input/output types to numpy types. 
+import triton_python_backend_utils as pb_utils + + +class TritonPythonModel: + """Your Python model must use the same class name. Every Python model + that is created must have "TritonPythonModel" as the class name. + """ + + def initialize(self, args): + """`initialize` is called only once when the model is being loaded. + Implementing `initialize` function is optional. This function allows + the model to initialize any state associated with this model. + + Parameters + ---------- + args : dict + Both keys and values are strings. The dictionary keys and values are: + * model_config: A JSON string containing the model configuration + * model_instance_kind: A string containing model instance kind + * model_instance_device_id: A string containing model instance device ID + * model_repository: Model repository path + * model_version: Model version + * model_name: Model name + """ + + # Parse model_config and extract OUTPUT0 and OUTPUT1 configuration + self.model_config = model_config = json.loads(args["model_config"]) + output0_config = pb_utils.get_output_config_by_name(model_config, "OUTPUT0") + output1_config = pb_utils.get_output_config_by_name(model_config, "OUTPUT1") + + # Convert Triton types to numpy types + self.out0_dtype = pb_utils.triton_string_to_numpy(output0_config["data_type"]) + self.out1_dtype = pb_utils.triton_string_to_numpy(output1_config["data_type"]) + + # Create a MetricFamily object to report the latency of the model + # execution. The 'kind' parameter must be either 'COUNTER' or + # 'GAUGE'. + # If duplicate name is used, both MetricFamily objects + # will reference to the same underlying MetricFamily. If there are two + # MetricFamily objects with the same name and same kind but different + # description, the original description will be used. Note that + # Duplicate name with different kind is not allowed. + self.metric_family = pb_utils.MetricFamily( + name="requests_process_latency_ns", + description="Cumulative time spent processing requests", + kind=pb_utils.MetricFamily.COUNTER, # or pb_utils.MetricFamily.GAUGE + ) + + # Create a Metric object under the MetricFamily object. The 'labels' + # is a dictionary of key-value pairs. You can create multiple Metric + # objects under the same MetricFamily object with unique labels. Empty + # labels is allowed. The 'labels' parameter is optional. If you don't + # specify the 'labels' parameter, empty labels will be used. + self.metric = self.metric_family.Metric( + labels={"model": "custom_metrics", "version": "1"} + ) + + def execute(self, requests): + """`execute` MUST be implemented in every Python model. `execute` + function receives a list of pb_utils.InferenceRequest as the only + argument. This function is called when an inference request is made + for this model. Depending on the batching configuration (e.g. Dynamic + Batching) used, `requests` may contain multiple requests. Every + Python model, must create one pb_utils.InferenceResponse for every + pb_utils.InferenceRequest in `requests`. If there is an error, you can + set the error argument when creating a pb_utils.InferenceResponse + + Parameters + ---------- + requests : list + A list of pb_utils.InferenceRequest + + Returns + ------- + list + A list of pb_utils.InferenceResponse. 
The length of this list must + be the same as `requests` + """ + + responses = [] + + # Record the start time of processing the requests + start_ns = time.time_ns() + # Every Python backend must iterate over everyone of the requests + # and create a pb_utils.InferenceResponse for each of them. + for request in requests: + # Get INPUT0 + in_0 = pb_utils.get_input_tensor_by_name(request, "INPUT0") + # Get INPUT1 + in_1 = pb_utils.get_input_tensor_by_name(request, "INPUT1") + + out_0, out_1 = ( + in_0.as_numpy() + in_1.as_numpy(), + in_0.as_numpy() - in_1.as_numpy(), + ) + + # Create output tensors. You need pb_utils.Tensor + # objects to create pb_utils.InferenceResponse. + out_tensor_0 = pb_utils.Tensor("OUTPUT0", out_0.astype(self.out0_dtype)) + out_tensor_1 = pb_utils.Tensor("OUTPUT1", out_1.astype(self.out1_dtype)) + + # Create InferenceResponse. You can set an error here in case + # there was a problem with handling this inference request. + # Below is an example of how you can set errors in inference + # response: + # + # pb_utils.InferenceResponse( + # output_tensors=..., TritonError("An error occurred")) + inference_response = pb_utils.InferenceResponse( + output_tensors=[out_tensor_0, out_tensor_1] + ) + responses.append(inference_response) + + # Record the end time of processing the requests + end_ns = time.time_ns() + + # Update metric to track cumulative requests processing latency. + # There are three operations you can do with the Metric object: + # - Metric.increment(value): Increment the value of the metric by + # the given value. The type of the value is double. The 'COUNTER' + # kind does not support negative value. + # - Metric.set(value): Set the value of the metric to the given + # value. This operation is only supported in 'GAUGE' kind. The + # type of the value is double. + # - Metric.value(): Get the current value of the metric. + self.metric.increment(end_ns - start_ns) + logger = pb_utils.Logger + logger.log_info( + "Cumulative requests processing latency: {}".format(self.metric.value()) + ) + + # You should return a list of pb_utils.InferenceResponse. Length + # of this list must match the length of `requests` list. + return responses + + def finalize(self): + """`finalize` is called only once when the model is being unloaded. + Implementing `finalize` function is OPTIONAL. This function allows + the model to perform any necessary clean ups before exit. + """ + print("Cleaning up...") diff --git a/examples/decoupled/README.md b/examples/decoupled/README.md index 22f4f68e..4301961e 100644 --- a/examples/decoupled/README.md +++ b/examples/decoupled/README.md @@ -1,5 +1,5 @@ + +# Preprocessing Using Python Backend Example This example shows how to preprocess your inputs using Python backend before it is passed to the TensorRT model for inference. This ensemble model includes an image preprocessing model (preprocess) and a TensorRT model (resnet50_trt) to do inference. **1. Converting PyTorch Model to ONNX format:** Run onnx_exporter.py to convert ResNet50 PyTorch model to ONNX format. Width and height dims are fixed at 224 but dynamic axes arguments for dynamic batching are used. Commands from the 2. and 3. subsections shall be executed within this Docker container. 
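Returning to the Metrics API described in `examples/custom_metrics/model.py` above: the comments mention the `GAUGE` kind and the `Metric.set()` / `Metric.value()` operations, but the example model only exercises a `COUNTER`. Below is a minimal sketch of how a gauge could be registered and updated with those same documented calls; the metric name, labels, and empty responses are illustrative placeholders rather than part of the example model.

```
import triton_python_backend_utils as pb_utils


class TritonPythonModel:
    def initialize(self, args):
        # GAUGE metrics can move up and down, so Metric.set() is allowed.
        # The family/metric names and labels here are illustrative only.
        self.batch_size_family = pb_utils.MetricFamily(
            name="last_batch_size",
            description="Number of requests in the most recent execute() call",
            kind=pb_utils.MetricFamily.GAUGE,
        )
        self.batch_size_metric = self.batch_size_family.Metric(
            labels={"model": "custom_metrics", "version": "1"}
        )

    def execute(self, requests):
        # A gauge is overwritten with set(); a counter would use increment().
        self.batch_size_metric.set(len(requests))
        pb_utils.Logger.log_info(
            "last_batch_size is now {}".format(self.batch_size_metric.value())
        )
        # Return one (empty) response per request; a real model would attach
        # output tensors as shown in the full custom_metrics example above.
        return [pb_utils.InferenceResponse(output_tensors=[]) for _ in requests]
```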
- $ docker run -it --gpus=all -v $(pwd):/workspace nvcr.io/nvidia/pytorch:xx.yy-py3 bash - $ pip install numpy pillow torchvision - $ python onnx_exporter.py --save model.onnx - + docker run -it --gpus=all -v $(pwd):/workspace nvcr.io/nvidia/pytorch:xx.yy-py3 bash + pip install numpy pillow torchvision + python onnx_exporter.py --save model.onnx + **2. Create the model repository:** - $ mkdir -p model_repository/ensemble_python_resnet50/1 - $ mkdir -p model_repository/preprocess/1 - $ mkdir -p model_repository/resnet50_trt/1 - + mkdir -p model_repository/ensemble_python_resnet50/1 + mkdir -p model_repository/preprocess/1 + mkdir -p model_repository/resnet50_trt/1 + # Copy the Python model - $ cp model.py model_repository/preprocess/1 + cp model.py model_repository/preprocess/1 **3. Build a TensorRT engine for the ONNX model** Set the arguments for enabling fp16 precision --fp16. To enable dynamic shapes use --minShapes, --optShapes, and maxShapes with --explicitBatch: - $ trtexec --onnx=model.onnx --saveEngine=./model_repository/resnet50_trt/1/model.plan --explicitBatch --minShapes=input:1x3x224x224 --optShapes=input:1x3x224x224 --maxShapes=input:256x3x224x224 --fp16 + trtexec --onnx=model.onnx --saveEngine=./model_repository/resnet50_trt/1/model.plan --explicitBatch --minShapes=input:1x3x224x224 --optShapes=input:1x3x224x224 --maxShapes=input:256x3x224x224 --fp16 **4. Run the command below to start the server container:** Under python_backend/examples/preprocessing, run this command to start the server docker container: - $ docker run --gpus=all -it --rm -p8000:8000 -p8001:8001 -p8002:8002 -v$(pwd):/workspace/ -v/$(pwd)/model_repository:/models nvcr.io/nvidia/tritonserver:xx.yy-py3 bash - $ pip install numpy pillow torchvision - $ tritonserver --model-repository=/models - + docker run --gpus=all -it --rm -p8000:8000 -p8001:8001 -p8002:8002 -v$(pwd):/workspace/ -v/$(pwd)/model_repository:/models nvcr.io/nvidia/tritonserver:xx.yy-py3 bash + pip install numpy pillow torchvision + tritonserver --model-repository=/models + **5. Start the client to test:** Under python_backend/examples/preprocessing, run the commands below to start the client Docker container: - $ wget https://raw.githubusercontent.com/triton-inference-server/server/main/qa/images/mug.jpg -O "mug.jpg" - $ docker run --rm --net=host -v $(pwd):/workspace/ nvcr.io/nvidia/tritonserver:xx.yy-py3-sdk python client.py --image mug.jpg - $ The result of classification is:COFFEE MUG + wget https://raw.githubusercontent.com/triton-inference-server/server/main/qa/images/mug.jpg -O "mug.jpg" + docker run --rm --net=host -v $(pwd):/workspace/ nvcr.io/nvidia/tritonserver:xx.yy-py3-sdk python client.py --image mug.jpg + The result of classification is:COFFEE MUG Here, since we input an image of "mug" and the inference result is "COFFEE MUG" which is correct. diff --git a/examples/preprocessing/client.py b/examples/preprocessing/client.py index dc0ebf0d..1ac107af 100644 --- a/examples/preprocessing/client.py +++ b/examples/preprocessing/client.py @@ -1,4 +1,4 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# Copyright 2021-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions @@ -24,53 +24,59 @@ # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-import os, sys -import numpy as np -import json -import tritongrpcclient import argparse +import json +import sys + +import numpy as np +import tritonclient.grpc as tritongrpcclient def load_image(img_path: str): """ Loads an encoded image as an array of bytes. - + """ - return np.fromfile(img_path, dtype='uint8') + return np.fromfile(img_path, dtype="uint8") if __name__ == "__main__": parser = argparse.ArgumentParser() - parser.add_argument("--model_name", - type=str, - required=False, - default="ensemble_python_resnet50", - help="Model name") - parser.add_argument("--image", - type=str, - required=True, - help="Path to the image") - parser.add_argument("--url", - type=str, - required=False, - default="localhost:8001", - help="Inference server URL. Default is localhost:8001.") - parser.add_argument('-v', - "--verbose", - action="http://github.com/store_true", - required=False, - default=False, - help='Enable verbose output') + parser.add_argument( + "--model_name", + type=str, + required=False, + default="ensemble_python_resnet50", + help="Model name", + ) + parser.add_argument("--image", type=str, required=True, help="Path to the image") + parser.add_argument( + "--url", + type=str, + required=False, + default="localhost:8001", + help="Inference server URL. Default is localhost:8001.", + ) + parser.add_argument( + "-v", + "--verbose", + action="http://github.com/store_true", + required=False, + default=False, + help="Enable verbose output", + ) parser.add_argument( "--label_file", type=str, default="./model_repository/resnet50_trt/labels.txt", - help="Path to the file with text representation of available labels") + help="Path to the file with text representation of available labels", + ) args = parser.parse_args() try: triton_client = tritongrpcclient.InferenceServerClient( - url=args.url, verbose=args.verbose) + url=args.url, verbose=args.verbose + ) except Exception as e: print("channel creation failed: " + str(e)) sys.exit(1) @@ -85,14 +91,13 @@ def load_image(img_path: str): image_data = load_image(args.image) image_data = np.expand_dims(image_data, axis=0) - inputs.append( - tritongrpcclient.InferInput(input_name, image_data.shape, "UINT8")) + inputs.append(tritongrpcclient.InferInput(input_name, image_data.shape, "UINT8")) outputs.append(tritongrpcclient.InferRequestedOutput(output_name)) inputs[0].set_data_from_numpy(image_data) - results = triton_client.infer(model_name=args.model_name, - inputs=inputs, - outputs=outputs) + results = triton_client.infer( + model_name=args.model_name, inputs=inputs, outputs=outputs + ) output0_data = results.as_numpy(output_name) print(output0_data) diff --git a/examples/preprocessing/model.py b/examples/preprocessing/model.py index d4117e2f..90259978 100644 --- a/examples/preprocessing/model.py +++ b/examples/preprocessing/model.py @@ -1,4 +1,4 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# Copyright 2021-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions @@ -24,20 +24,18 @@ # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -import numpy as np -import sys -import json import io +import json + +import numpy as np +import torchvision.transforms as transforms # triton_python_backend_utils is available in every Triton Python model. 
You # need to use this module to create inference requests and responses. It also # contains some utility functions for extracting information from model_config # and converting Triton input/output types to numpy types. import triton_python_backend_utils as pb_utils - from PIL import Image -import torchvision.transforms as transforms -import os class TritonPythonModel: @@ -48,7 +46,7 @@ class TritonPythonModel: def initialize(self, args): """`initialize` is called only once when the model is being loaded. Implementing `initialize` function is optional. This function allows - the model to intialize any state associated with this model. + the model to initialize any state associated with this model. Parameters ---------- @@ -63,15 +61,15 @@ def initialize(self, args): """ # You must parse model_config. JSON string is not parsed here - self.model_config = model_config = json.loads(args['model_config']) + self.model_config = model_config = json.loads(args["model_config"]) # Get OUTPUT0 configuration - output0_config = pb_utils.get_output_config_by_name( - model_config, "OUTPUT_0") + output0_config = pb_utils.get_output_config_by_name(model_config, "OUTPUT_0") # Convert Triton types to numpy types self.output0_dtype = pb_utils.triton_string_to_numpy( - output0_config['data_type']) + output0_config["data_type"] + ) def execute(self, requests): """`execute` MUST be implemented in every Python model. `execute` @@ -105,18 +103,22 @@ def execute(self, requests): # Get INPUT0 in_0 = pb_utils.get_input_tensor_by_name(request, "INPUT_0") - normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406], - std=[0.229, 0.224, 0.225]) + normalize = transforms.Normalize( + mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225] + ) - loader = transforms.Compose([ - transforms.Resize([224, 224]), - transforms.CenterCrop(224), - transforms.ToTensor(), normalize - ]) + loader = transforms.Compose( + [ + transforms.Resize([224, 224]), + transforms.CenterCrop(224), + transforms.ToTensor(), + normalize, + ] + ) def image_loader(image_name): image = loader(image_name) - #expand the dimension to nchw + # expand the dimension to nchw image = image.unsqueeze(0) return image @@ -126,8 +128,7 @@ def image_loader(image_name): img_out = image_loader(image) img_out = np.array(img_out) - out_tensor_0 = pb_utils.Tensor("OUTPUT_0", - img_out.astype(output0_dtype)) + out_tensor_0 = pb_utils.Tensor("OUTPUT_0", img_out.astype(output0_dtype)) # Create InferenceResponse. You can set an error here in case # there was a problem with handling this inference request. @@ -135,9 +136,10 @@ def image_loader(image_name): # response: # # pb_utils.InferenceResponse( - # output_tensors=..., TritonError("An error occured")) + # output_tensors=..., TritonError("An error occurred")) inference_response = pb_utils.InferenceResponse( - output_tensors=[out_tensor_0]) + output_tensors=[out_tensor_0] + ) responses.append(inference_response) # You should return a list of pb_utils.InferenceResponse. Length @@ -149,4 +151,4 @@ def finalize(self): Implementing `finalize` function is OPTIONAL. This function allows the model to perform any necessary clean ups before exit. 
""" - print('Cleaning up...') + print("Cleaning up...") diff --git a/examples/preprocessing/model_repository/preprocess/config.pbtxt b/examples/preprocessing/model_repository/preprocess/config.pbtxt index 1125dea3..fcfbd93b 100644 --- a/examples/preprocessing/model_repository/preprocess/config.pbtxt +++ b/examples/preprocessing/model_repository/preprocess/config.pbtxt @@ -26,7 +26,7 @@ name: "preprocess" backend: "python" -max_batch_size: 256 +max_batch_size: 256 input [ { name: "INPUT_0" @@ -34,7 +34,7 @@ input [ dims: [ -1 ] } ] - + output [ { name: "OUTPUT_0" diff --git a/examples/preprocessing/model_repository/resnet50_trt/config.pbtxt b/examples/preprocessing/model_repository/resnet50_trt/config.pbtxt index d464d582..a4b94402 100644 --- a/examples/preprocessing/model_repository/resnet50_trt/config.pbtxt +++ b/examples/preprocessing/model_repository/resnet50_trt/config.pbtxt @@ -32,7 +32,7 @@ input [ name: "input" data_type: TYPE_FP32 dims: [3, -1, -1 ] - + } ] output[ diff --git a/examples/preprocessing/model_repository/resnet50_trt/labels.txt b/examples/preprocessing/model_repository/resnet50_trt/labels.txt index e59113f7..2376a285 100644 --- a/examples/preprocessing/model_repository/resnet50_trt/labels.txt +++ b/examples/preprocessing/model_repository/resnet50_trt/labels.txt @@ -517,7 +517,7 @@ COWBOY HAT CRADLE CRANE CRASH HELMET -CRATE +CREATE CRIB CROCK POT CROQUET BALL diff --git a/examples/preprocessing/onnx_exporter.py b/examples/preprocessing/onnx_exporter.py index 9148e4e6..3be47b57 100644 --- a/examples/preprocessing/onnx_exporter.py +++ b/examples/preprocessing/onnx_exporter.py @@ -24,11 +24,12 @@ # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -import torch -import torchvision.models as models import argparse import os +import torch +import torchvision.models as models + if __name__ == "__main__": parser = argparse.ArgumentParser() parser.add_argument("--save", default="model.onnx") @@ -38,23 +39,19 @@ dummy_input = torch.randn(1, 3, 224, 224) resnet50 = resnet50.eval() - torch.onnx.export(resnet50, - dummy_input, - args.save, - export_params=True, - opset_version=10, - do_constant_folding=True, - input_names=['input'], - output_names=['output'], - dynamic_axes={ - 'input': { - 0: 'batch_size', - 2: "height", - 3: 'width' - }, - 'output': { - 0: 'batch_size' - } - }) + torch.onnx.export( + resnet50, + dummy_input, + args.save, + export_params=True, + opset_version=10, + do_constant_folding=True, + input_names=["input"], + output_names=["output"], + dynamic_axes={ + "input": {0: "batch_size", 2: "height", 3: "width"}, + "output": {0: "batch_size"}, + }, + ) print("Saved {}".format(args.save)) diff --git a/examples/pytorch/client.py b/examples/pytorch/client.py index ee29b5fe..af1abd39 100644 --- a/examples/pytorch/client.py +++ b/examples/pytorch/client.py @@ -24,10 +24,11 @@ # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-from tritonclient.utils import * -import tritonclient.http as httpclient import sys + import numpy as np +import tritonclient.http as httpclient +from tritonclient.utils import * model_name = "pytorch" shape = [4] @@ -36,10 +37,12 @@ input0_data = np.random.rand(*shape).astype(np.float32) input1_data = np.random.rand(*shape).astype(np.float32) inputs = [ - httpclient.InferInput("INPUT0", input0_data.shape, - np_to_triton_dtype(input0_data.dtype)), - httpclient.InferInput("INPUT1", input1_data.shape, - np_to_triton_dtype(input1_data.dtype)), + httpclient.InferInput( + "INPUT0", input0_data.shape, np_to_triton_dtype(input0_data.dtype) + ), + httpclient.InferInput( + "INPUT1", input1_data.shape, np_to_triton_dtype(input1_data.dtype) + ), ] inputs[0].set_data_from_numpy(input0_data) @@ -50,19 +53,22 @@ httpclient.InferRequestedOutput("OUTPUT1"), ] - response = client.infer(model_name, - inputs, - request_id=str(1), - outputs=outputs) + response = client.infer(model_name, inputs, request_id=str(1), outputs=outputs) result = response.get_response() output0_data = response.as_numpy("OUTPUT0") output1_data = response.as_numpy("OUTPUT1") - print("INPUT0 ({}) + INPUT1 ({}) = OUTPUT0 ({})".format( - input0_data, input1_data, output0_data)) - print("INPUT0 ({}) - INPUT1 ({}) = OUTPUT0 ({})".format( - input0_data, input1_data, output1_data)) + print( + "INPUT0 ({}) + INPUT1 ({}) = OUTPUT0 ({})".format( + input0_data, input1_data, output0_data + ) + ) + print( + "INPUT0 ({}) - INPUT1 ({}) = OUTPUT0 ({})".format( + input0_data, input1_data, output1_data + ) + ) if not np.allclose(input0_data + input1_data, output0_data): print("pytorch example error: incorrect sum") @@ -72,5 +78,5 @@ print("pytorch example error: incorrect difference") sys.exit(1) - print('PASS: pytorch') + print("PASS: pytorch") sys.exit(0) diff --git a/examples/pytorch/model.py b/examples/pytorch/model.py index 3383acc0..89b0c8a2 100644 --- a/examples/pytorch/model.py +++ b/examples/pytorch/model.py @@ -25,13 +25,13 @@ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. import json -from torch import nn # triton_python_backend_utils is available in every Triton Python model. You # need to use this module to create inference requests and responses. It also # contains some utility functions for extracting information from model_config # and converting Triton input/output types to numpy types. import triton_python_backend_utils as pb_utils +from torch import nn class AddSubNet(nn.Module): @@ -55,7 +55,7 @@ class TritonPythonModel: def initialize(self, args): """`initialize` is called only once when the model is being loaded. Implementing `initialize` function is optional. This function allows - the model to intialize any state associated with this model. + the model to initialize any state associated with this model. Parameters ---------- @@ -70,21 +70,21 @@ def initialize(self, args): """ # You must parse model_config. 
JSON string is not parsed here - self.model_config = model_config = json.loads(args['model_config']) + self.model_config = model_config = json.loads(args["model_config"]) # Get OUTPUT0 configuration - output0_config = pb_utils.get_output_config_by_name( - model_config, "OUTPUT0") + output0_config = pb_utils.get_output_config_by_name(model_config, "OUTPUT0") # Get OUTPUT1 configuration - output1_config = pb_utils.get_output_config_by_name( - model_config, "OUTPUT1") + output1_config = pb_utils.get_output_config_by_name(model_config, "OUTPUT1") # Convert Triton types to numpy types self.output0_dtype = pb_utils.triton_string_to_numpy( - output0_config['data_type']) + output0_config["data_type"] + ) self.output1_dtype = pb_utils.triton_string_to_numpy( - output1_config['data_type']) + output1_config["data_type"] + ) # Instantiate the PyTorch model self.add_sub_model = AddSubNet() @@ -128,10 +128,8 @@ def execute(self, requests): # Create output tensors. You need pb_utils.Tensor # objects to create pb_utils.InferenceResponse. - out_tensor_0 = pb_utils.Tensor("OUTPUT0", - out_0.astype(output0_dtype)) - out_tensor_1 = pb_utils.Tensor("OUTPUT1", - out_1.astype(output1_dtype)) + out_tensor_0 = pb_utils.Tensor("OUTPUT0", out_0.astype(output0_dtype)) + out_tensor_1 = pb_utils.Tensor("OUTPUT1", out_1.astype(output1_dtype)) # Create InferenceResponse. You can set an error here in case # there was a problem with handling this inference request. @@ -139,9 +137,10 @@ def execute(self, requests): # response: # # pb_utils.InferenceResponse( - # output_tensors=..., TritonError("An error occured")) + # output_tensors=..., TritonError("An error occurred")) inference_response = pb_utils.InferenceResponse( - output_tensors=[out_tensor_0, out_tensor_1]) + output_tensors=[out_tensor_0, out_tensor_1] + ) responses.append(inference_response) # You should return a list of pb_utils.InferenceResponse. Length @@ -153,4 +152,4 @@ def finalize(self): Implementing `finalize` function is optional. This function allows the model to perform any necessary clean ups before exit. """ - print('Cleaning up...') + print("Cleaning up...") diff --git a/inferentia/README.md b/inferentia/README.md index 50d443e0..fb0de4f7 100644 --- a/inferentia/README.md +++ b/inferentia/README.md @@ -1,5 +1,5 @@ -# Using Triton with Inferentia +# Using Triton with Inferentia 1 Starting from 21.11 release, Triton supports -[AWS Inferentia](https://aws.amazon.com/machine-learning/inferentia/) +[AWS Inferentia](https://aws.amazon.com/machine-learning/inferentia/) and the [Neuron Runtime](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/neuron-intro/get-started.html). ## Table of Contents -- [Using Triton with Inferentia](#using-triton-with-inferentia) +- [Using Triton with Inferentia 1](#using-triton-with-inferentia-1) - [Table of Contents](#table-of-contents) - [Inferentia setup](#inferentia-setup) - [Setting up the Inferentia model](#setting-up-the-inferentia-model) @@ -59,56 +59,46 @@ After logging into the inf1* instance, you will need to clone or simply clone with https. Clone this repo with Github to home repo `/home/ubuntu`. 
-Ensure that the neuron runtime 1.0 demon (neuron-rtd) is not running and set up -and install neuron 2.X runtime builds with ``` - $chmod 777 /home/ubuntu/python_backend/inferentia/scripts/setup-pre-container.sh - $sudo /home/ubuntu/python_backend/inferentia/scripts/setup-pre-container.sh + chmod 777 /home/ubuntu/python_backend/inferentia/scripts/setup-pre-container.sh + sudo /home/ubuntu/python_backend/inferentia/scripts/setup-pre-container.sh ``` Then, start the Triton instance with: -``` - $docker run --device /dev/neuron0 -v /home/ubuntu/python_backend:/home/ubuntu/python_backend -v /lib/udev:/mylib/udev --shm-size=1g --ulimit memlock=-1 -p 8000:8000 -p 8001:8001 -p 8002:8002 --ulimit stack=67108864 -ti nvcr.io/nvidia/tritonserver:-py3 +``` + docker run --device /dev/neuron0 -v /home/ubuntu/python_backend:/home/ubuntu/python_backend -v /lib/udev:/mylib/udev --shm-size=1g --ulimit memlock=-1 -p 8000:8000 -p 8001:8001 -p 8002:8002 --ulimit stack=67108864 -ti nvcr.io/nvidia/tritonserver:-py3 ``` Note 1: The user would need to list any neuron device to run during container initialization. For example, to use 4 neuron devices on an instance, the user would need to run with: ``` - $docker run --device /dev/neuron0 --device /dev/neuron1 --device /dev/neuron2 --device /dev/neuron3 ...` + docker run --device /dev/neuron0 --device /dev/neuron1 --device /dev/neuron2 --device /dev/neuron3 ...` ``` -Note 2: `/mylib/udev` is used for Neuron parameter passing. +Note 2: `/mylib/udev` is used for Neuron parameter passing. -Note 3: For Triton container version xx.yy, please refer to +Note 3: For Triton container version xx.yy, please refer to [Triton Inference Server Container Release Notes](https://docs.nvidia.com/deeplearning/triton-inference-server/release-notes/index.html). - The current build script has been tested with container version `21.10`. + The current build script has been tested with container version `21.10`. After starting the Triton container, go into the `python_backend` folder and run the setup script. ``` - $source /home/ubuntu/python_backend/inferentia/scripts/setup.sh + source /home/ubuntu/python_backend/inferentia/scripts/setup.sh ``` This script will: -1. Setup miniconda enviroment -2. Install necessary dependencies -3. Create a [Custom Python Execution Environment](https://github.com/triton-inference-server/python_backend#using-custom-python-execution-environments), - `python_backend_stub` to use for Inferentia -4. Install [neuron-cc](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/neuron-guide/neuron-cc/index.html), - the Neuron compiler and [neuron-rtd](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/neuron-guide/neuron-runtime/overview.html) the Neuron Runtime +1. Install necessary dependencies +2. Install [neuron-cc](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/neuron-guide/neuron-cc/index.html), the Neuron compiler. +3. Install neuron framework packages as per your preference e.g., either pytorch, or tensorflow or both. -There are user configurable options available for the script as well. -For example, to control the python version for the python environment to 3.6, -you can run: -``` - $source /home/ubuntu/python_backend/inferentia/scripts/setup.sh -v 3.6 -``` +There are user configurable options available for the script as well. Please use the `-h` or `--help` options to learn about more configurable options. 
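For example, to list the configurable options before running the full setup (a sketch; the option names and defaults come from the script's own help output):
```
  source /home/ubuntu/python_backend/inferentia/scripts/setup.sh -h
```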
## Setting up the Inferentia model Currently, we only support [PyTorch](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/neuron-guide/neuron-frameworks/pytorch-neuron/index.html) and [TensorFlow](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/neuron-guide/neuron-frameworks/tensorflow-neuron/index.html) -workflows for execution on inferentia. +workflows for execution on inferentia. -The user is required to create their own `*.pt` (for pytorch) or `*.savedmodels` -(for tensorflow) models. This is a critical step since Inferentia will need +The user is required to create their own `*.pt` (for pytorch) or `*.savedmodels` +(for tensorflow) models. This is a critical step since Inferentia will need the underlying `.NEFF` graph to execute the inference request. Please refer to: - [Neuron compiler CLI Reference Guide](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/neuron-guide/neuron-cc/command-line-reference.html) @@ -128,7 +118,7 @@ triton python model directory. An example invocation for the `gen_triton_model.py` for PyTorch model can look like: ``` - $python3 inferentia/scripts/gen_triton_model.py --model_type pytorch --triton_input INPUT__0,INT64,4x384 INPUT__1,INT64,4x384 INPUT__2,INT64,4x384 --triton_output OUTPUT__0,INT64,4x384 OUTPUT__1,INT64,4x384 --compiled_model /home/ubuntu/bert_large_mlperf_neuron_hack_bs1_dynamic.pt --neuron_core_range 0:3 --triton_model_dir bert-large-mlperf-bs1x4 + python3 inferentia/scripts/gen_triton_model.py --model_type pytorch --triton_input INPUT__0,INT64,4x384 INPUT__1,INT64,4x384 INPUT__2,INT64,4x384 --triton_output OUTPUT__0,INT64,4x384 OUTPUT__1,INT64,4x384 --compiled_model /home/ubuntu/bert_large_mlperf_neuron_hack_bs1_dynamic.pt --neuron_core_range 0:3 --triton_model_dir bert-large-mlperf-bs1x4 ``` In order for the script to treat the compiled model as TorchScript @@ -171,7 +161,7 @@ script to generate triton python model directory. An example invocation for the `gen_triton_model.py` for TensorFlow model can look like: ``` - $python3 gen_triton_model.py --model_type tensorflow --compiled_model /home/ubuntu/inferentia-poc-2.0/scripts-rn50-tf-native/resnet50_mlperf_opt_fp16_compiled_b5_nc1/1 --neuron_core_range 0:3 --triton_model_dir rn50-1neuroncores-bs1x1 + python3 gen_triton_model.py --model_type tensorflow --compiled_model /home/ubuntu/inferentia-poc-2.0/scripts-rn50-tf-native/resnet50_mlperf_opt_fp16_compiled_b5_nc1/1 --neuron_core_range 0:3 --triton_model_dir rn50-1neuroncores-bs1x1 ``` NOTE: Unlike TorchScript model, TensorFlow SavedModel stores sufficient @@ -225,7 +215,7 @@ a valid torchscript file or tensorflow savedmodel. Now, the server can be launched with the model as below: ``` - $tritonserver --model-repository + tritonserver --model-repository ``` Note: @@ -238,13 +228,13 @@ their need. ### Using Triton's Dynamic Batching To enable dynamic batching, `--enable_dynamic_batching` -flag needs to be specified. `gen_triton_model.py` supports following three +flag needs to be specified. `gen_triton_model.py` supports following three options for configuring [Triton's dynamic batching](https://github.com/triton-inference-server/server/blob/main/docs/user_guide/model_configuration.md): 1. `--preferred_batch_size`: Please refer to [model configuration documentation](https://github.com/triton-inference-server/server/blob/main/docs/user_guide/model_configuration.md#preferred-batch-sizes) for details on preferred batch size. To optimize performance, this is recommended to be multiples of engaged neuron cores. 
For example, if each instance is using 2 neuron cores, `preferred_batch_size` - could be 2, 4 or 6. + could be 2, 4 or 6. 2. `--max_queue_delay_microseconds`: Please refer to [model configuration documentation](https://github.com/triton-inference-server/server/blob/main/docs/user_guide/model_configuration.md#delayed-batching) for details. 3. `--disable_batch_requests_to_neuron`: Enable the non-default way for Triton to @@ -265,8 +255,8 @@ contains the necessary files to set up testing with a simple add_sub model. The test requires an instance with more than 8 inferentia cores to run, eg:`inf1.6xlarge`. To start the test, run ``` - $source /python_backend/inferentia/qa/setup_test_enviroment_and_test.sh -``` + source /python_backend/inferentia/qa/setup_test_enviroment_and_test.sh +``` where `` is usually `/home/ubuntu`/. This script will pull the [server repo](https://github.com/triton-inference-server/server) that contains the tests for inferentia. It will then build the most recent @@ -275,6 +265,86 @@ Triton Server and Triton SDK. Note: If you need to change some of the tests in the server repo, you would need to run ``` - $export TRITON_SERVER_REPO_TAG= -``` + export TRITON_SERVER_REPO_TAG= +``` before running the script. + +# Using Triton with Inferentia 2, or Trn1 +## pytorch-neuronx and tensorflow-neuronx +1. Similar to the steps for inf1, change the arguments to the pre-container and on-container setup scripts to include the `-inf2` or `-trn1` flags, e.g., +``` + chmod 777 /home/ubuntu/python_backend/inferentia/scripts/setup-pre-container.sh + sudo /home/ubuntu/python_backend/inferentia/scripts/setup-pre-container.sh -inf2 +``` +2. On the container, after the `docker run` command, you can pass a similar argument to the setup.sh script. +For PyTorch: +``` +source /home/ubuntu/python_backend/inferentia/scripts/setup.sh -inf2 -p +``` +For TensorFlow: +``` +source /home/ubuntu/python_backend/inferentia/scripts/setup.sh -inf2 -t +``` +3. Following the above steps, when using the `gen_triton_model.py` script, you can pass a similar `--inf2` argument, e.g., for PyTorch: +``` +python3 inferentia/scripts/gen_triton_model.py --inf2 --model_type pytorch --triton_input INPUT__0,INT64,4x384 INPUT__1,INT64,4x384 INPUT__2,INT64,4x384 --triton_output OUTPUT__0,INT64,4x384 OUTPUT__1,INT64,4x384 --compiled_model bert_large_mlperf_neuron_hack_bs1_dynamic.pt --neuron_core_range 0:3 --triton_model_dir bert-large-mlperf-bs1x4 +``` +4. **Note**: When using the `--inf2` option, the `--compiled_model` path should be provided relative to the triton model directory. The `initialize()` function in model.py will derive the full path by concatenating the model path within the repository and the relative `--compiled_model` path. +## transformers-neuronx +To use inf2/trn1 instances with transformers-neuronx packages for serving models, generate a `pytorch` model as per the above instructions. The transformers-neuronx package currently supports the models listed [here](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/frameworks/torch/torch-neuronx/transformers-neuronx/readme.html#currently-supported-models). + +As described on the neuronx documentation page, the neuronx load API differs per model but follows the same pattern. + +1. To serve transformers-neuronx models, first trace the model using the `save_pretrained_split()` API on an inf2 instance (inf2.24xl is recommended for Large Language Models). Following that, package the folder and pass it as `--compiled_model` when using the `gen_triton_model.py` script. +2.
The following tree shows a sample model structure for the OPT model: +``` +opt/ +├── 1 +│   └── model.py +├── opt-125m-model +│   └── pytorch_model.bin +└── opt-125m-tp12 + ├── FullyUnrolled.1814.1 + │   ├── penguin-sg0000 + │   └── sg00 + ├── FullyUnrolled.1814.2 + │   ├── penguin-sg0000 + │   └── sg00 + ├── FullyUnrolled.1814.3 + │   ├── penguin-sg0000 + │   └── sg00 + ├── FullyUnrolled.1814.4 + │   ├── penguin-sg0000 + │   └── sg00 + └── FullyUnrolled.1814.5 + ├── penguin-sg0000 + └── sg00 + ├── config.pbtxt +``` + +3. Add the following imports (e.g., for the OPT model). The import will differ depending on the model you are trying to run. +``` +from transformers_neuronx.opt.model import OPTForSampling +``` + +4. Add the following lines in the `initialize()` function. Set the `batch_size`, `tp_degree`, `n_positions`, `amp` and `unroll` args as per your requirements. `tp_degree` should typically match the number of neuron cores available on the inf2 instance. +``` +batch_size = 1 +tp_degree = 12 +n_positions = 2048 +amp = 'bf16' +unroll = None +self.model_neuron = OPTForSampling.from_pretrained(compiled_model, batch_size=batch_size, amp=amp, tp_degree=tp_degree, n_positions=n_positions, unroll=unroll) +self.model_neuron.to_neuron() + +self.model_neuron.num_workers = num_threads +``` +You may also choose to add the `batch_size` etc. arguments to config.pbtxt as parameters and read them in the `initialize()` function similarly to `--compiled_model`. + +5. Finally, in the `execute()` function, use the following API to run the inference: +``` +batched_results = self.model_neuron.sample(batched_tensor, 2048) +``` +Above, `2048` is a sufficiently long output token length. It may also be passed in as one of the inputs if you want to specify it as part of the payload. + +6. Proceed to load the model and submit inference payloads as you would for any other Triton model. \ No newline at end of file diff --git a/inferentia/qa/setup_test_enviroment_and_test.sh b/inferentia/qa/setup_test_enviroment_and_test.sh old mode 100644 new mode 100755 index 7972dae7..cf6057ac --- a/inferentia/qa/setup_test_enviroment_and_test.sh +++ b/inferentia/qa/setup_test_enviroment_and_test.sh @@ -51,7 +51,7 @@ UPSTREAM_CONTAINER_VERSION="" USAGE=" usage: setup_test_enviroment_and_test.sh [options]. These setting will override exported variables -Setup enviroment for testing on Inferentia chips and run perf analyzer tests. +Setup environment for testing on Inferentia chips and run perf analyzer tests. -h|--help Shows usage -d|--default-repo-tag DEFAULT_REPO_TAG for building the test container. Default is main -s|--server-repo-tag TRITON_SERVER_REPO_TAG for building test container.
Default same DEFAULT_REPO_TAG @@ -113,7 +113,7 @@ cd ${TRITON_PATH}/server git clone --single-branch --depth=1 -b ${TRITON_CLIENT_REPO_TAG} \ https://github.com/triton-inference-server/client.git clientrepo -# First set up inferentia and run in detatched mode +# First set up inferentia and run in detached mode cd ${TRITON_PATH}/python_backend chmod 777 ${TRITON_PATH}/python_backend/inferentia/scripts/setup-pre-container.sh sudo ${TRITON_PATH}/python_backend/inferentia/scripts/setup-pre-container.sh @@ -131,7 +131,7 @@ if [ "${UPSTREAM_CONTAINER_VERSION}" = "" ]; then echo "found upstream container version: ${UPSTREAM_CONTAINER_VERSION} from build.py" fi -# Build container with only python backend +# Build container with only python backend cd ${TRITON_PATH}/server pip3 install docker ./build.py --container-version=${CONTAINER_VERSION} \ diff --git a/inferentia/scripts/gen_triton_model.py b/inferentia/scripts/gen_triton_model.py index 0e8a4e10..caa2450c 100644 --- a/inferentia/scripts/gen_triton_model.py +++ b/inferentia/scripts/gen_triton_model.py @@ -30,6 +30,7 @@ def tf_to_triton_dtype(dtype): import tensorflow as tf + if dtype == tf.float16: return "FP16" elif dtype == tf.float32: @@ -62,12 +63,13 @@ def tf_to_triton_dtype(dtype): def parse_tf_tensors(saved_model_dir, tag_set, signature_def_key): from tensorflow.python.tools import saved_model_utils - meta_graph_def = saved_model_utils.get_meta_graph_def( - saved_model_dir, tag_set) + + meta_graph_def = saved_model_utils.get_meta_graph_def(saved_model_dir, tag_set) input_dict = {} input_signatures = list( - meta_graph_def.signature_def[signature_def_key].inputs.values()) + meta_graph_def.signature_def[signature_def_key].inputs.values() + ) for input_signature in input_signatures: datatype = tf_to_triton_dtype(input_signature.dtype) shape = [] @@ -77,7 +79,8 @@ def parse_tf_tensors(saved_model_dir, tag_set, signature_def_key): output_dict = {} output_signatures = list( - meta_graph_def.signature_def[signature_def_key].outputs.values()) + meta_graph_def.signature_def[signature_def_key].outputs.values() + ) for output_signature in output_signatures: datatype = tf_to_triton_dtype(output_signature.dtype) shape = [] @@ -98,61 +101,81 @@ def parse_io_tensors(tensors): def get_parameter_spec(key1, value): - param_spec = "parameters: {{key: \"{}\", value: {{string_value: \"{}\"}}}} \n".format( - key1, value) + param_spec = 'parameters: {{key: "{}", value: {{string_value: "{}"}}}} \n'.format( + key1, value + ) return param_spec -def create_modelconfig(model_name, max_batch_size, inputs, outputs, - compiled_model_path, nc_start_idx, nc_end_idx, - threads_per_core, instance_count, - enable_dynamic_batching, preferred_batch_size, - max_queue_delay_microseconds): - config = "name: \"{}\"\n".format(model_name) - config += "backend: \"python\"\n" +def create_modelconfig( + model_name, + max_batch_size, + inputs, + outputs, + compiled_model_path, + nc_start_idx, + nc_end_idx, + threads_per_core, + instance_count, + enable_dynamic_batching, + preferred_batch_size, + max_queue_delay_microseconds, +): + config = 'name: "{}"\n'.format(model_name) + config += 'backend: "python"\n' config += "max_batch_size: {}\n".format(max_batch_size) if enable_dynamic_batching: - config += ''' + config += """ dynamic_batching { -''' +""" if preferred_batch_size is not None: - config += ''' + config += """ preferred_batch_size: {} -'''.format(preferred_batch_size) +""".format( + preferred_batch_size + ) if max_queue_delay_microseconds is not None: - config += ''' + config 
+= """ max_queue_delay_microseconds: {} -'''.format(max_queue_delay_microseconds) - config += ''' -}\n''' +""".format( + max_queue_delay_microseconds + ) + config += """ +}\n""" for input_name in inputs.keys(): data_type, shape = inputs[input_name] - config += ''' + config += """ input [ {{ name: \"{}\" data_type: {} dims: {} }} -]\n'''.format(input_name, "TYPE_" + data_type, shape) +]\n""".format( + input_name, "TYPE_" + data_type, shape + ) for output_name in outputs.keys(): data_type, shape = outputs[output_name] - config += ''' + config += """ output [ {{ name: \"{}\" data_type: {} dims: {} }} -]\n'''.format(output_name, "TYPE_" + data_type, shape) - config += ''' +]\n""".format( + output_name, "TYPE_" + data_type, shape + ) + config += """ instance_group [ {{ kind: KIND_MODEL count: {} }} -]\n'''.format(instance_count) +]\n""".format( + instance_count + ) config += get_parameter_spec("COMPILED_MODEL", compiled_model_path) config += get_parameter_spec("NEURON_CORE_START_INDEX", nc_start_idx) config += get_parameter_spec("NEURON_CORE_END_INDEX", nc_end_idx) @@ -161,7 +184,7 @@ def create_modelconfig(model_name, max_batch_size, inputs, outputs, def get_model_license(): - lic = '''# Copyright 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + lic = """# Copyright 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions @@ -186,7 +209,7 @@ def get_model_license(): # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - ''' + """ return lic @@ -195,7 +218,7 @@ def get_common_initialize_impl(): def initialize(self, args): """`initialize` is called only once when the model is being loaded. Implementing `initialize` function is optional. This function allows - the model to intialize any state associated with this model. + the model to initialize any state associated with this model. 
Parameters ---------- @@ -231,6 +254,7 @@ def initialize(self, args): params = model_config['parameters'] compiled_model = params['COMPILED_MODEL']['string_value'] + nc_start_idx = int(params['NEURON_CORE_START_INDEX']['string_value']) nc_end_idx = int(params['NEURON_CORE_END_INDEX']['string_value']) if nc_end_idx < nc_start_idx: @@ -255,9 +279,9 @@ def initialize(self, args): return init_impl -def get_tensorflow_initialize_impl(): +def get_tensorflow_initialize_impl(is_inf2=False): init_impl = get_common_initialize_impl() - init_impl += ''' + init_impl += """ self.input_list = [] for config_input in model_config['input']: self.input_list.append( @@ -270,21 +294,28 @@ def get_tensorflow_initialize_impl(): (config_output['name'], config_output['data_type'], config_output['dims'])) - # TODO: Validate input/output from the model - os.environ["NEURON_RT_NUM_CORES"] = str(cores_per_instance) - +""" + if is_inf2: + init_impl += """ + compiled_model = os.path.join(args['model_repository'], compiled_model) + self.pred_list = [ + tf.keras.models.load_model(compiled_model) + for _ in range(cores_per_instance) + ] * threads_per_core +""" + else: + init_impl += """ self.pred_list = [ tf.contrib.predictor.from_saved_model(compiled_model) for _ in range(cores_per_instance) ] * threads_per_core - -''' +""" return init_impl -def get_pytorch_initialize_impl(): - init_impl = ''' +def get_pytorch_initialize_impl(is_inf2=False): + init_impl = """ def _validate_and_get_index(self, name): parts = name.split('__') if len(parts) != 2: @@ -310,9 +341,9 @@ def _validate_output_dict(self, expected_count): if i not in self.output_dict: raise pb_utils.TritonModelException( "output corresponding to index {} not found".format(i)) -''' +""" init_impl += get_common_initialize_impl() - init_impl += ''' + init_impl += """ self.input_dict = {} expected_input_count = 0 for config_input in model_config['input']: @@ -340,12 +371,20 @@ def _validate_output_dict(self, expected_count): os.environ["NEURON_RT_VISIBLE_CORES"] = cores_range consumed_cores_list = [i for i in range(cores_per_instance)] - +""" + if is_inf2: + init_impl += """ + compiled_model = os.path.join(args['model_repository'], compiled_model) + self.model_neuron = torch.jit.load(compiled_model) +""" + else: + init_impl += """ self.model_neuron = torch.neuron.DataParallel( - torch.jit.load(compiled_model), device_ids=consumed_cores_list) + torch.jit.load(compiled_model), device_ids=consumed_cores_list) +""" + init_impl += """ self.model_neuron.num_workers = num_threads - -''' +""" return init_impl @@ -378,7 +417,7 @@ def execute(self, requests): """ ''' if disable_batch_requests_to_neuron: - exec_impl += ''' + exec_impl += """ responses = [] num_threads = len(self.pred_list) model_feed_dict_list = [{} for _ in range(num_threads)] @@ -420,9 +459,9 @@ def execute(self, requests): output_tensors=output_tensors) responses.append(inference_response) return responses -''' +""" else: - exec_impl += ''' + exec_impl += """ responses = [] num_threads = len(self.pred_list) model_feed_dict_list = [{} for _ in range(num_threads)] @@ -467,7 +506,7 @@ def execute(self, requests): full_tensor = np.concatenate( (full_tensor, out_list[idx + 1]), axis=0) chuncky_tensors.append(np.split(full_tensor, request_batch_sizes, axis=0)) - + for i in range(num_requests): output_tensors = [] for j in range(len(self.output_list)): @@ -482,7 +521,7 @@ def execute(self, requests): responses.append(inference_response) return responses -''' +""" return exec_impl @@ -511,7 +550,7 @@ def execute(self, 
requests): """ ''' if disable_batch_requests_to_neuron: - exec_impl += ''' + exec_impl += """ responses = [] for request in requests: inputs = [] @@ -533,9 +572,9 @@ def execute(self, requests): output_tensors=output_tensors) responses.append(inference_response) return responses -''' +""" else: - exec_impl += ''' + exec_impl += """ responses = [] inputs = [] num_requests = len(requests) @@ -572,7 +611,7 @@ def execute(self, requests): responses.append(inference_response) return responses -''' +""" return exec_impl @@ -589,8 +628,9 @@ def finalize(self): return finalize_impl -def get_triton_python_model_impl(using_tensorflow_model, - disable_batch_requests_to_neuron): +def get_triton_python_model_impl( + using_tensorflow_model, disable_batch_requests_to_neuron, is_inf2=False +): triton_pmi = ''' class TritonPythonModel: """Your Python model must use the same class name. Every Python model @@ -599,11 +639,10 @@ class TritonPythonModel: ''' if using_tensorflow_model: - triton_pmi += get_tensorflow_initialize_impl() - triton_pmi += get_tensorflow_execute_impl( - disable_batch_requests_to_neuron) + triton_pmi += get_tensorflow_initialize_impl(is_inf2) + triton_pmi += get_tensorflow_execute_impl(disable_batch_requests_to_neuron) else: - triton_pmi += get_pytorch_initialize_impl() + triton_pmi += get_pytorch_initialize_impl(is_inf2) triton_pmi += get_pytorch_execute_impl(disable_batch_requests_to_neuron) triton_pmi += get_finalize_impl() @@ -611,124 +650,154 @@ class TritonPythonModel: return triton_pmi -def create_model_file(using_tensorflow_model, disable_batch_requests_to_neuron): +def create_model_file( + using_tensorflow_model, disable_batch_requests_to_neuron, is_inf2=False +): triton_model = get_model_license() - triton_model += ''' + triton_model += """ import json import numpy as np import os import sys import triton_python_backend_utils as pb_utils -''' +""" if using_tensorflow_model: - triton_model += ''' + triton_model += """ import tensorflow as tf from concurrent import futures -''' +""" else: - triton_model += ''' + triton_model += """ import torch + """ + if not is_inf2: + triton_model += """ import torch.neuron - ''' + """ + else: + triton_model += """ +import torch_neuronx +""" triton_model += get_triton_python_model_impl( - using_tensorflow_model, disable_batch_requests_to_neuron) + using_tensorflow_model, disable_batch_requests_to_neuron, is_inf2 + ) return triton_model -if __name__ == '__main__': +if __name__ == "__main__": parser = argparse.ArgumentParser() - parser.add_argument('--model_type', - type=str, - required=True, - choices=['pytorch', 'tensorflow'], - help='''The type of the compiled model. Currently, - only supports \"pytorch\" and \"tensorflow\".''') - parser.add_argument('--model_version', - type=int, - default=1, - help='The version of the model') parser.add_argument( - '--enable_dynamic_batching', + "--inf2", + required=False, + default=False, action="http://github.com/store_true", - help='''Enable dynamic batching. Please see model configuration - documentation for details: - https://github.com/triton-inference-server/server/blob/main/docs/model_configuration.md#dynamic-batcher''' + help="Specify whether the model should be generate for inf2 or inf1, default is inf1", + ) + parser.add_argument( + "--model_type", + type=str, + required=True, + choices=["pytorch", "tensorflow"], + help="""The type of the compiled model. 
Currently, + only supports \"pytorch\" and \"tensorflow\".""", ) parser.add_argument( - '--max_batch_size', + "--model_version", type=int, default=1, help="The version of the model" + ) + parser.add_argument( + "--enable_dynamic_batching", + action="http://github.com/store_true", + help="""Enable dynamic batching. Please see model configuration + documentation for details: + https://github.com/triton-inference-server/server/blob/main/docs/model_configuration.md#dynamic-batcher""", + ) + parser.add_argument( + "--max_batch_size", type=int, default=0, - help='''The maximum batch size for the model being generated. - Please see model configuration documentation for details: - https://github.com/triton-inference-server/server/blob/main/docs/model_configuration.md#maximum-batch-size''' + help="""The maximum batch size for the model being generated. + Please see model configuration documentation for details: + https://github.com/triton-inference-server/server/blob/main/docs/model_configuration.md#maximum-batch-size""", ) - parser.add_argument('--preferred_batch_size', - type=int, - help='''The preferred batch size. Should be multiples + parser.add_argument( + "--preferred_batch_size", + type=int, + help="""The preferred batch size. Should be multiples of cores available to ensure proper utilization of - neuron cores. - This flag is ignored if --enable_dynamic_batching is - not specified. Please see model configuration - documentation for details: - https://github.com/triton-inference-server/server/blob/main/docs/model_configuration.md#preferred-batch-sizes''' - ) - parser.add_argument('--max_queue_delay_microseconds', - type=int, - help='''Max queue delay time(ms) for dynamic batching. - This flag is ignored if --enable_dynamic_batching is not specified. - Please see model configuration documentation for details: - https://github.com/triton-inference-server/server/blob/main/docs/model_configuration.md#delayed-batching''' - ) + neuron cores. + This flag is ignored if --enable_dynamic_batching is + not specified. Please see model configuration + documentation for details: + https://github.com/triton-inference-server/server/blob/main/docs/model_configuration.md#preferred-batch-sizes""", + ) + parser.add_argument( + "--max_queue_delay_microseconds", + type=int, + help="""Max queue delay time(ms) for dynamic batching. + This flag is ignored if --enable_dynamic_batching is not specified. + Please see model configuration documentation for details: + https://github.com/triton-inference-server/server/blob/main/docs/model_configuration.md#delayed-batching""", + ) parser.add_argument( - '--disable_batch_requests_to_neuron', + "--disable_batch_requests_to_neuron", action="http://github.com/store_true", - help='''Send each request separately to neuron if enabled. - If not specified, then requests are combined and sent to - neuron as a single batch''') - parser.add_argument('--tag_set', - type=str, - default="serve", - help='''The tag set to use for the TF model. + help="""Send each request separately to neuron if enabled. + If not specified, then requests are combined and sent to + neuron as a single batch""", + ) + parser.add_argument( + "--tag_set", + type=str, + default="serve", + help="""The tag set to use for the TF model. This option is ignored if `--model_type` is - not \"tensorflow\". Default value is \'serve\'.''') - parser.add_argument('--signature_def_key', - type=str, - default="serving_default", - help='''The signature def key to use for the TF + not \"tensorflow\". 
Default value is \'serve\'.""", + ) + parser.add_argument( + "--signature_def_key", + type=str, + default="serving_default", + help="""The signature def key to use for the TF model. This option is ignored if `--model_type` is not \"tensorflow\". Default value - is \'serving_default\'.''') - parser.add_argument('--compiled_model', - type=str, - required=True, - help='Fullpath to the compiled model') + is \'serving_default\'.""", + ) parser.add_argument( - '--triton_input', + "--compiled_model", type=str, - action='http://github.com/append', + required=True, + help="Fullpath to the compiled model", + ) + parser.add_argument( + "--triton_input", + type=str, + action="http://github.com/append", nargs="*", - help='''The name, datatype and shape of the model input in + help="""The name, datatype and shape of the model input in format ,,. This option can be provided multiple times for multiple inputs. For example, to provide a FP16 input with shape [1,384] specify the following: INPUT0,FP16,1x384. - This option is not required when using tensorflow model''') + This option is not required when using tensorflow model""", + ) parser.add_argument( - '--triton_output', + "--triton_output", type=str, - action='http://github.com/append', + action="http://github.com/append", nargs="*", - help='''The name, datatype and shape of the model output in + help="""The name, datatype and shape of the model output in format ,,. This option can be provided multiple times for multiple outputs. For example, to provide a FP16 output with shape [1,384] specify the following: OUTPUT0,FP16,1x384. - This option is not required when using tensorflow model''') - parser.add_argument('--neuron_core_range', - type=str, - required=True, - help='''The range of neuron core indices + This option is not required when using tensorflow model""", + ) + parser.add_argument( + "--neuron_core_range", + type=str, + required=True, + help="""The range of neuron core indices where the model needs to be loaded. The range should be specified in format :. 
For example to @@ -740,49 +809,61 @@ def create_model_file(using_tensorflow_model, disable_batch_requests_to_neuron): loaded on cores 0:1, Instance1 will get loaded on cores 2:3, Instance2 will get loaded on cores 4:5 and Instance 3 will get loaded on - cores 6:7''') - parser.add_argument('--threads_per_core', - type=int, - default=1, - help='The number of threads per neuron core.') - parser.add_argument('--triton_model_instance_count', - type=int, - default=1, - help='The number of triton model instances.') - parser.add_argument('--triton_model_dir', - type=str, - required=True, - help='''Path to the triton model + cores 6:7""", + ) + parser.add_argument( + "--threads_per_core", + type=int, + default=1, + help="The number of threads per neuron core.", + ) + parser.add_argument( + "--triton_model_instance_count", + type=int, + default=1, + help="The number of triton model instances.", + ) + parser.add_argument( + "--triton_model_dir", + type=str, + required=True, + help="""Path to the triton model directory where script will generate - config.pbtxt and model.py''') + config.pbtxt and model.py""", + ) FLAGS, unparsed = parser.parse_known_args() if len(unparsed) > 0: raise Exception("Unrecognized options: {}".format(unparsed)) - if FLAGS.model_type == 'tensorflow': + if FLAGS.model_type == "tensorflow": is_tensorflow_model = True - elif FLAGS.model_type == 'pytorch': + elif FLAGS.model_type == "pytorch": is_tensorflow_model = False - print('''Triton Dynamic Batching is enabled: {}, - preferred_batch_size: {} and max_batch_size: {} - with max_queue_delay_microseconds: {}. - Batch requests to neruon are disabled: {}'''.format( - FLAGS.enable_dynamic_batching, FLAGS.preferred_batch_size, - FLAGS.max_batch_size, FLAGS.max_queue_delay_microseconds, - FLAGS.disable_batch_requests_to_neuron)) + print( + """Triton Dynamic Batching is enabled: {}, + preferred_batch_size: {} and max_batch_size: {} + with max_queue_delay_microseconds: {}. 
+ Batch requests to neruon are disabled: {}""".format( + FLAGS.enable_dynamic_batching, + FLAGS.preferred_batch_size, + FLAGS.max_batch_size, + FLAGS.max_queue_delay_microseconds, + FLAGS.disable_batch_requests_to_neuron, + ) + ) - if not is_tensorflow_model or (FLAGS.triton_input != None and - FLAGS.triton_output != None): + if not is_tensorflow_model or ( + FLAGS.triton_input != None and FLAGS.triton_output != None + ): inputs = parse_io_tensors(FLAGS.triton_input) outputs = parse_io_tensors(FLAGS.triton_output) else: - inputs, outputs = parse_tf_tensors(FLAGS.compiled_model, FLAGS.tag_set, - FLAGS.signature_def_key) + inputs, outputs = parse_tf_tensors( + FLAGS.compiled_model, FLAGS.tag_set, FLAGS.signature_def_key + ) - nc_start_idx, nc_end_idx = [ - int(i) for i in FLAGS.neuron_core_range.split(":") - ] + nc_start_idx, nc_end_idx = [int(i) for i in FLAGS.neuron_core_range.split(":")] model_version_dir = FLAGS.triton_model_dir + "/" + str(FLAGS.model_version) try: @@ -792,14 +873,26 @@ def create_model_file(using_tensorflow_model, disable_batch_requests_to_neuron): model_name = os.path.basename(FLAGS.triton_model_dir) mc = create_modelconfig( - model_name, FLAGS.max_batch_size, inputs, outputs, FLAGS.compiled_model, - nc_start_idx, nc_end_idx, FLAGS.threads_per_core, - FLAGS.triton_model_instance_count, FLAGS.enable_dynamic_batching, - FLAGS.preferred_batch_size, FLAGS.max_queue_delay_microseconds) + model_name, + FLAGS.max_batch_size, + inputs, + outputs, + FLAGS.compiled_model, + nc_start_idx, + nc_end_idx, + FLAGS.threads_per_core, + FLAGS.triton_model_instance_count, + FLAGS.enable_dynamic_batching, + FLAGS.preferred_batch_size, + FLAGS.max_queue_delay_microseconds, + ) with open(FLAGS.triton_model_dir + "/config.pbtxt", "w") as config_file: config_file.write(mc) - mf = create_model_file(is_tensorflow_model, - FLAGS.disable_batch_requests_to_neuron) + is_inf2 = FLAGS.inf2 + + mf = create_model_file( + is_tensorflow_model, FLAGS.disable_batch_requests_to_neuron, is_inf2 + ) with open(FLAGS.triton_model_dir + "/1/model.py", "w") as model_file: model_file.write(mf) diff --git a/inferentia/scripts/setup-pre-container.sh b/inferentia/scripts/setup-pre-container.sh old mode 100644 new mode 100755 index f95ee4ce..f6f5ae16 --- a/inferentia/scripts/setup-pre-container.sh +++ b/inferentia/scripts/setup-pre-container.sh @@ -1,5 +1,5 @@ #!/bin/bash -# Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# Copyright 2021-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions @@ -24,11 +24,75 @@ # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +#! /bin/sh + +USAGE=" +usage: setup.sh [options] + +Sets up runtime and tools for execution on Inferentia chips. 
+-h|--help Shows usage +-inf1|--inf1-setup Installs runtime and tools for inf1/neuron, inf1 is default +-inf2|--inf2-setup Installs runtime and tools for inf2/neuronx +-trn1|--trn1-setup Installs runtime, tools for inf2, and installs EFA for trn1 +" + +# Get all options: +OPTS=$(getopt -o hb:v:i:tp --long help,python-backend-path:,python-version:,inferentia-path:,use-tensorflow,use-pytorch,tensorflow-version: -- "$@") + + +export INSTALL_INF2=0 +export INSTALL_INF1=1 +export INSTALL_TRN1=0 + +export CWD=`pwd` + cd /home/ubuntu +for OPTS; do + case "$OPTS" in + -h|--help) + printf "%s\\n" "$USAGE" + return 0 + ;; + -inf1|--inf1-setup) + INSTALL_INF1=1 + echo "Script will install runtime and tools for inf1/neuron" + shift 1 + ;; + -inf2|--inf2-setup) + INSTALL_INF2=1 + shift 1 + echo "Script will install runtime and tools for inf2/neruonx" + ;; + -trn1|--trn1-setup) + INSTALL_TRN1=1 + echo "Script will install runtime and tools for trn1" + shift 1 + ;; + esac +done + +if [ ${INSTALL_INF1} -ne 1 ] && [ ${INSTALL_INF2} -ne 1 ]; then + echo "Error: need to specify either -inf1, -inf2 or -trn1." + printf "source %s\\n" "$USAGE" + return 1 +fi + +if [ ${INSTALL_INF1} -eq 1 ] && [ ${INSTALL_INF2} -eq 1] +then + echo "Error: cannot install both inf1 and inf2 dependencies. Please select either -inf1 or -inf2." + return 1 +fi + +if [ ${INSTALL_INF1} -eq 1 ] && [ ${INSTALL_TRN1} -eq 1 ] +then + echo "Error: cannot install both inf1 and trn1 dependencies. Selecting -trn1 will install inf2 dependencies and EFA." +fi + # First stop and remove old neuron 1.X runtime -sudo systemctl stop neuron-rtd -sudo apt remove aws-neuron-runtime -y +sudo systemctl stop neuron-rtd || true +sudo apt remove aws-neuron-runtime -y || true # Then install new neuron libraries . /etc/os-release @@ -36,9 +100,31 @@ sudo tee /etc/apt/sources.list.d/neuron.list > /dev/null </dev/null | \ - gpg --dearmor - | \ - tee /etc/apt/trusted.gpg.d/kitware.gpg >/dev/null && \ -apt-add-repository 'deb https://apt.kitware.com/ubuntu/ focal main' && \ -apt-get update && \ -apt-get install -y --no-install-recommends \ -cmake-data=3.21.1-0kitware1ubuntu20.04.1 cmake=3.21.1-0kitware1ubuntu20.04.1 && \ -cmake --version - -# Create Conda Enviroment -conda create -q -y -n test_conda_env python=${PYTHON_VERSION} -source ${CONDA_PATH}/bin/activate test_conda_env - -# First compile correct python stub -cd ${PYTHON_BACKEND_PATH} -rm -rf build && mkdir build && cd build -cmake -DTRITON_ENABLE_GPU=ON -DCMAKE_INSTALL_PREFIX:PATH=`pwd`/install .. -make triton-python-backend-stub -j16 # Set Pip repository to point to the Neuron repository -# since we need to use pip to update: +# since we need to use pip to update: # https://aws.amazon.com/blogs/developer/neuron-conda-packages-eol/ pip config set global.extra-index-url https://pip.repos.neuron.amazonaws.com -conda config --env --add channels https://conda.repos.neuron.amazonaws.com +pip install --upgrade pip + +if [ ${INSTALL_INF2} -eq 1 ];then + # Install Neuron Runtime + # Then install new neuron libraries + . 
/etc/os-release + tee /etc/apt/sources.list.d/neuron.list > /dev/null <& correlation_id_shm) +{ + id_uint_ = correlation_id_shm->id_uint_; + id_type_ = correlation_id_shm->id_type_; + id_string_ = correlation_id_shm->id_string_; +} + +CorrelationId& +CorrelationId::operator=(const CorrelationId& rhs) +{ + id_uint_ = rhs.id_uint_; + id_type_ = rhs.id_type_; + id_string_ = rhs.id_string_; + return *this; +} + +void +CorrelationId::SaveToSharedMemory( + std::unique_ptr& shm_pool) +{ + AllocatedSharedMemory correlation_id_shm = + shm_pool->Construct(); + correlation_id_shm_ptr_ = correlation_id_shm.data_.get(); + + std::unique_ptr id_string_shm = + PbString::Create(shm_pool, id_string_); + + correlation_id_shm_ptr_->id_uint = id_uint_; + correlation_id_shm_ptr_->id_string_shm_handle = id_string_shm->ShmHandle(); + correlation_id_shm_ptr_->id_type = id_type_; + + // Save the references to shared memory. + correlation_id_shm_ = std::move(correlation_id_shm); + id_string_shm_ = std::move(id_string_shm); + shm_handle_ = correlation_id_shm_.handle_; +} + +std::unique_ptr +CorrelationId::LoadFromSharedMemory( + std::unique_ptr& shm_pool, + bi::managed_external_buffer::handle_t handle) +{ + AllocatedSharedMemory correlation_id_shm = + shm_pool->Load(handle); + CorrelationIdShm* correlation_id_shm_ptr = correlation_id_shm.data_.get(); + + std::unique_ptr id_string_shm = PbString::LoadFromSharedMemory( + shm_pool, correlation_id_shm_ptr->id_string_shm_handle); + + return std::unique_ptr( + new CorrelationId(correlation_id_shm, id_string_shm)); +} + +CorrelationId::CorrelationId( + AllocatedSharedMemory& correlation_id_shm, + std::unique_ptr& id_string_shm) + : correlation_id_shm_(std::move(correlation_id_shm)), + id_string_shm_(std::move(id_string_shm)) +{ + correlation_id_shm_ptr_ = correlation_id_shm_.data_.get(); + shm_handle_ = correlation_id_shm_.handle_; + id_string_ = id_string_shm_->String(); + id_uint_ = correlation_id_shm_ptr_->id_uint; + id_type_ = correlation_id_shm_ptr_->id_type; +} + +}}}; // namespace triton::backend::python diff --git a/src/correlation_id.h b/src/correlation_id.h new file mode 100644 index 00000000..63185d9f --- /dev/null +++ b/src/correlation_id.h @@ -0,0 +1,93 @@ +// Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions +// are met: +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// * Neither the name of NVIDIA CORPORATION nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +#pragma once + +#include + +#include "pb_string.h" +#include "pb_utils.h" + +namespace triton { namespace backend { namespace python { + +enum class CorrelationIdDataType { UINT64, STRING }; + +struct CorrelationIdShm { + bi::managed_external_buffer::handle_t id_string_shm_handle; + uint64_t id_uint; + CorrelationIdDataType id_type; +}; + +class CorrelationId { + public: + CorrelationId(); + CorrelationId(const std::string& id_string); + CorrelationId(uint64_t id_uint); + CorrelationId(const CorrelationId& rhs); + CorrelationId(std::unique_ptr& correlation_id_shm); + CorrelationId& operator=(const CorrelationId& rhs); + + /// Save CorrelationId object to shared memory. + /// \param shm_pool Shared memory pool to save the CorrelationId object. + void SaveToSharedMemory(std::unique_ptr& shm_pool); + + /// Create a CorrelationId object from shared memory. + /// \param shm_pool Shared memory pool + /// \param handle Shared memory handle of the CorrelationId. + /// \return Returns the CorrelationId in the specified handle + /// location. + static std::unique_ptr LoadFromSharedMemory( + std::unique_ptr& shm_pool, + bi::managed_external_buffer::handle_t handle); + + // Function that help determine exact type of Correlation Id + CorrelationIdDataType Type() const { return id_type_; } + + // Get the value of the CorrelationId based on the type + const std::string& StringValue() const { return id_string_; } + uint64_t UnsignedIntValue() const { return id_uint_; } + + bi::managed_external_buffer::handle_t ShmHandle() { return shm_handle_; } + + private: + // The private constructor for creating a CorrelationId object from shared + // memory. + CorrelationId( + AllocatedSharedMemory& correlation_id_shm, + std::unique_ptr& id_string_shm); + + std::string id_string_; + uint64_t id_uint_; + CorrelationIdDataType id_type_; + + // Shared Memory Data Structures + AllocatedSharedMemory correlation_id_shm_; + CorrelationIdShm* correlation_id_shm_ptr_; + bi::managed_external_buffer::handle_t shm_handle_; + std::unique_ptr id_string_shm_; +}; + +}}}; // namespace triton::backend::python diff --git a/src/gpu_buffers.cc b/src/gpu_buffers.cc new file mode 100644 index 00000000..4b1b0f9f --- /dev/null +++ b/src/gpu_buffers.cc @@ -0,0 +1,89 @@ +// Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions +// are met: +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. 
+// * Neither the name of NVIDIA CORPORATION nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +#include "gpu_buffers.h" + +#include "pb_string.h" + +namespace triton { namespace backend { namespace python { +GPUBuffersHelper::GPUBuffersHelper() +{ + completed_ = false; +} + +void +GPUBuffersHelper::AddBuffer(const bi::managed_external_buffer::handle_t& handle) +{ + if (completed_) { + throw PythonBackendException( + "It is not possible to add buffers after 'Complete' has been called on " + "a GPUBuffersHelper."); + } + + buffers_.emplace_back(handle); +} + +void +GPUBuffersHelper::SetError( + std::unique_ptr& shm_pool, const std::string& error) +{ + error_shm_ = PbString::Create(shm_pool, error); +} + +void +GPUBuffersHelper::Complete(std::unique_ptr& shm_pool) +{ + if (completed_) { + throw PythonBackendException( + "Complete has already been called. Complete should only be called " + "once."); + } + gpu_buffers_shm_ = shm_pool->Construct(); + if (!error_shm_) { + buffers_handle_shm_ = + shm_pool->Construct( + buffers_.size()); + gpu_buffers_shm_.data_->buffer_count = buffers_.size(); + gpu_buffers_shm_.data_->success = true; + gpu_buffers_shm_.data_->buffers = buffers_handle_shm_.handle_; + for (size_t i = 0; i < buffers_.size(); ++i) { + buffers_handle_shm_.data_.get()[i] = buffers_[i]; + } + } else { + gpu_buffers_shm_.data_->success = false; + gpu_buffers_shm_.data_->error = error_shm_->ShmHandle(); + } + completed_ = true; +} + + +bi::managed_external_buffer::handle_t +GPUBuffersHelper::ShmHandle() +{ + return gpu_buffers_shm_.handle_; +} + +}}} // namespace triton::backend::python diff --git a/src/gpu_buffers.h b/src/gpu_buffers.h new file mode 100644 index 00000000..fd683ba7 --- /dev/null +++ b/src/gpu_buffers.h @@ -0,0 +1,67 @@ +// Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions +// are met: +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// * Neither the name of NVIDIA CORPORATION nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. 
+// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +#pragma once + +#include "pb_string.h" +#include "pb_utils.h" +#include "scoped_defer.h" + +namespace triton { namespace backend { namespace python { + +/// \param success indicating whether the process of fetching the GPU buffers +/// was successful. +/// \param error if success is equal to false, the error object will be set. +/// \param buffers list of buffers elements. +/// \param buffer_count the number of buffers. +struct GPUBuffersShm { + bool success; + bi::managed_external_buffer::handle_t error; + bi::managed_external_buffer::handle_t buffers; + uint32_t buffer_count; +}; + +/// Helper class to facilitate transfer of metadata associated +/// the GPU buffers in shared memory. +class GPUBuffersHelper { + public: + GPUBuffersHelper(); + void AddBuffer(const bi::managed_external_buffer::handle_t& handle); + void Complete(std::unique_ptr& shm_pool); + void SetError( + std::unique_ptr& shm_pool, const std::string& error); + bi::managed_external_buffer::handle_t ShmHandle(); + + private: + AllocatedSharedMemory gpu_buffers_shm_; + std::vector buffers_; + AllocatedSharedMemory + buffers_handle_shm_; + std::unique_ptr error_shm_; + bool completed_; +}; + +}}}; // namespace triton::backend::python diff --git a/src/infer_payload.cc b/src/infer_payload.cc index a61335a7..6baad307 100644 --- a/src/infer_payload.cc +++ b/src/infer_payload.cc @@ -1,4 +1,4 @@ -// Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// Copyright 2023-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions @@ -31,30 +31,33 @@ namespace triton { namespace backend { namespace python { InferPayload::InferPayload( const bool is_decoupled, std::function)> callback) - : is_decoupled_(is_decoupled), is_promise_set_(false), callback_(callback) + : is_decoupled_(is_decoupled), is_promise_set_(false), callback_(callback), + request_address_(reinterpret_cast(nullptr)) { - prev_promise_.reset(new std::promise>()); -} - -InferPayload::~InferPayload() -{ - prev_promise_.reset(); + promise_.reset(new std::promise>()); } void -InferPayload::SetValueForPrevPromise( - std::unique_ptr infer_response) +InferPayload::SetValue(std::unique_ptr infer_response) { - prev_promise_->set_value(std::move(infer_response)); - prev_promise_.reset(); - is_promise_set_ = true; + { + // Only set value to the promise with the first response. Call the callback + // function to send decoupled response to the stub. 
+ std::lock_guard lock(mutex_); + if (!is_promise_set_) { + is_promise_set_ = true; + promise_->set_value(std::move(infer_response)); + return; + } + } + Callback(std::move(infer_response)); } void InferPayload::SetFuture( std::future>& response_future) { - response_future = prev_promise_->get_future(); + response_future = promise_->get_future(); } bool @@ -89,4 +92,31 @@ InferPayload::ResponseAllocUserp() return response_alloc_userp_; } +void +InferPayload::SetRequestAddress(intptr_t request_address) +{ + std::unique_lock lock(request_address_mutex_); + request_address_ = request_address; +} + +void +InferPayload::SetRequestCancellationFunc( + const std::function& request_cancel_func) +{ + request_cancel_func_ = request_cancel_func; +} + +void +InferPayload::SafeCancelRequest() +{ + std::unique_lock lock(request_address_mutex_); + if (request_address_ == 0L) { + return; + } + + if (request_cancel_func_) { + request_cancel_func_(request_address_); + } +} + }}} // namespace triton::backend::python diff --git a/src/infer_payload.h b/src/infer_payload.h index 5c0458a5..8e4aa7d3 100644 --- a/src/infer_payload.h +++ b/src/infer_payload.h @@ -1,4 +1,4 @@ -// Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// Copyright 2023-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions @@ -28,6 +28,7 @@ #include #include + #include "infer_response.h" #include "pb_preferred_memory.h" @@ -43,14 +44,17 @@ struct ResponseAllocatorUserp { PreferredMemory preferred_memory; }; -class InferPayload { +class InferPayload : public std::enable_shared_from_this { public: InferPayload( const bool is_decouple, std::function)> callback); - ~InferPayload(); - void SetValueForPrevPromise(std::unique_ptr infer_response); + /// GetPtr should be only called when the InferPayload object is constructed + /// using a shared pointer. Calling this function in any other circumstance + /// is undefined behaviour until C++17. + std::shared_ptr GetPtr() { return shared_from_this(); } + void SetValue(std::unique_ptr infer_response); void SetFuture(std::future>& response_future); bool IsDecoupled(); bool IsPromiseSet(); @@ -58,13 +62,21 @@ class InferPayload { void SetResponseAllocUserp( const ResponseAllocatorUserp& response_alloc_userp); std::shared_ptr ResponseAllocUserp(); + void SetRequestAddress(intptr_t request_address); + void SetRequestCancellationFunc( + const std::function& request_cancel_func); + void SafeCancelRequest(); private: - std::unique_ptr>> prev_promise_; + std::unique_ptr>> promise_; bool is_decoupled_; + std::mutex mutex_; bool is_promise_set_; std::function)> callback_; std::shared_ptr response_alloc_userp_; + std::mutex request_address_mutex_; + intptr_t request_address_; + std::function request_cancel_func_; }; }}} // namespace triton::backend::python diff --git a/src/infer_request.cc b/src/infer_request.cc index 2a9799db..e5733662 100644 --- a/src/infer_request.cc +++ b/src/infer_request.cc @@ -1,4 +1,4 @@ -// Copyright 2021-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// Copyright 2021-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
// // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions @@ -28,6 +28,7 @@ #include +#include "gpu_buffers.h" #include "pb_utils.h" #include "scoped_defer.h" #ifdef TRITON_PB_STUB @@ -37,18 +38,19 @@ namespace triton { namespace backend { namespace python { InferRequest::InferRequest( - const std::string& request_id, uint64_t correlation_id, + const std::string& request_id, const CorrelationId& correlation_id, const std::vector>& inputs, const std::set& requested_output_names, const std::string& model_name, const int64_t model_version, - const std::string& parameters, const uint32_t flags, const int32_t timeout, + const std::string& parameters, const uint32_t flags, const uint64_t timeout, const intptr_t response_factory_address, const intptr_t request_address, - const PreferredMemory& preferred_memory) + const PreferredMemory& preferred_memory, const InferenceTrace& trace) : request_id_(request_id), correlation_id_(correlation_id), inputs_(inputs), requested_output_names_(requested_output_names), model_name_(model_name), model_version_(model_version), parameters_(parameters), flags_(flags), timeout_(timeout), response_factory_address_(response_factory_address), - request_address_(request_address), preferred_memory_(preferred_memory) + request_address_(request_address), preferred_memory_(preferred_memory), + trace_(trace), request_release_flags_(TRITONSERVER_REQUEST_RELEASE_ALL) { for (auto& input : inputs) { if (!input) { @@ -66,12 +68,13 @@ InferRequest::InferRequest( } } - inputs_ = inputs; - requested_output_names_ = requested_output_names; #ifdef TRITON_PB_STUB + pb_cancel_ = + std::make_shared(response_factory_address_, request_address_); response_sender_ = std::make_shared( - request_address_, response_factory_address_, - Stub::GetOrCreateInstance()->SharedMemory()); + request_address_, response_factory_address_, nullptr /* is_decoupled */, + RequestedOutputNames(), Stub::GetOrCreateInstance()->SharedMemory(), + pb_cancel_); #endif } @@ -93,8 +96,8 @@ InferRequest::RequestId() return request_id_; } -uint64_t -InferRequest::CorrelationId() +CorrelationId& +InferRequest::GetCorrelationId() { return correlation_id_; } @@ -141,7 +144,7 @@ InferRequest::ShmHandle() return shm_handle_; } -int32_t +uint64_t InferRequest::Timeout() { return timeout_; @@ -165,6 +168,26 @@ InferRequest::GetPreferredMemory() return preferred_memory_; } +InferenceTrace& +InferRequest::GetTrace() +{ + return trace_; +} + +uint32_t +InferRequest::ReleaseFlags() +{ + request_release_flags_ = infer_request_shm_ptr_->request_release_flags; + return request_release_flags_; +} + +void +InferRequest::SetReleaseFlags(const uint32_t& flags) +{ + request_release_flags_ = flags; + infer_request_shm_ptr_->request_release_flags = request_release_flags_; +} + void InferRequest::SaveToSharedMemory(std::unique_ptr& shm_pool) { @@ -172,14 +195,10 @@ InferRequest::SaveToSharedMemory(std::unique_ptr& shm_pool) sizeof(InferRequestShm) + (RequestedOutputNames().size() * sizeof(bi::managed_external_buffer::handle_t)) + - (Inputs().size() * sizeof(bi::managed_external_buffer::handle_t)) + - PbString::ShmStructSize(ModelName()) + - PbString::ShmStructSize(RequestId()) + - PbString::ShmStructSize(Parameters())); + (Inputs().size() * sizeof(bi::managed_external_buffer::handle_t))); infer_request_shm_ptr_ = reinterpret_cast(infer_request_shm.data_.get()); - infer_request_shm_ptr_->correlation_id = CorrelationId(); infer_request_shm_ptr_->input_count = 
Inputs().size(); infer_request_shm_ptr_->model_version = model_version_; infer_request_shm_ptr_->requested_output_count = @@ -190,6 +209,7 @@ InferRequest::SaveToSharedMemory(std::unique_ptr& shm_pool) infer_request_shm_ptr_->is_decoupled = is_decoupled_; infer_request_shm_ptr_->timeout = timeout_; infer_request_shm_ptr_->preferred_memory = preferred_memory_; + infer_request_shm_ptr_->request_release_flags = request_release_flags_; output_names_handle_shm_ptr_ = reinterpret_cast( @@ -220,30 +240,24 @@ InferRequest::SaveToSharedMemory(std::unique_ptr& shm_pool) i++; } - size_t model_name_offset = - sizeof(InferRequestShm) + - (RequestedOutputNames().size() * - sizeof(bi::managed_external_buffer::handle_t)) + - (Inputs().size() * sizeof(bi::managed_external_buffer::handle_t)); - - std::unique_ptr model_name_shm = PbString::Create( - ModelName(), - reinterpret_cast(infer_request_shm_ptr_) + model_name_offset, - infer_request_shm.handle_ + model_name_offset); - - size_t request_id_offset = - model_name_offset + PbString::ShmStructSize(ModelName()); - std::unique_ptr request_id_shm = PbString::Create( - RequestId(), - reinterpret_cast(infer_request_shm_ptr_) + request_id_offset, - infer_request_shm.handle_ + request_id_offset); - - size_t parameters_offset = - request_id_offset + PbString::ShmStructSize(RequestId()); - std::unique_ptr parameters_shm = PbString::Create( - Parameters(), - reinterpret_cast(infer_request_shm_ptr_) + parameters_offset, - infer_request_shm.handle_ + parameters_offset); + correlation_id_.SaveToSharedMemory(shm_pool); + infer_request_shm_ptr_->correlation_id_shm_handle = + correlation_id_.ShmHandle(); + + std::unique_ptr model_name_shm = + PbString::Create(shm_pool, ModelName()); + infer_request_shm_ptr_->model_name_shm_handle = model_name_shm->ShmHandle(); + + std::unique_ptr request_id_shm = + PbString::Create(shm_pool, RequestId()); + infer_request_shm_ptr_->request_id_shm_handle = request_id_shm->ShmHandle(); + + std::unique_ptr parameters_shm = + PbString::Create(shm_pool, Parameters()); + infer_request_shm_ptr_->parameters_shm_handle = parameters_shm->ShmHandle(); + + trace_.SaveToSharedMemory(shm_pool); + infer_request_shm_ptr_->trace_shm_handle = trace_.ShmHandle(); // Save the references to shared memory. 
infer_request_shm_ = std::move(infer_request_shm); @@ -257,7 +271,8 @@ InferRequest::SaveToSharedMemory(std::unique_ptr& shm_pool) std::unique_ptr InferRequest::LoadFromSharedMemory( std::unique_ptr& shm_pool, - bi::managed_external_buffer::handle_t request_handle, bool open_cuda_handle) + bi::managed_external_buffer::handle_t request_handle, bool open_cuda_handle, + bool const* is_model_decoupled) { AllocatedSharedMemory infer_request_shm = shm_pool->Load(request_handle); @@ -295,38 +310,37 @@ InferRequest::LoadFromSharedMemory( input_tensors.emplace_back(std::move(input_tensor)); } - size_t model_name_offset = - sizeof(InferRequestShm) + - (requested_output_count * sizeof(bi::managed_external_buffer::handle_t)) + - (infer_request_shm_ptr->input_count * - sizeof(bi::managed_external_buffer::handle_t)); + std::unique_ptr correlation_id_shm = + CorrelationId::LoadFromSharedMemory( + shm_pool, infer_request_shm_ptr->correlation_id_shm_handle); - std::unique_ptr model_name_shm = PbString::LoadFromSharedMemory( - request_handle + model_name_offset, - reinterpret_cast(infer_request_shm_ptr) + model_name_offset); + std::unique_ptr infer_trace_shm = + InferenceTrace::LoadFromSharedMemory( + shm_pool, infer_request_shm_ptr->trace_shm_handle); - size_t request_id_offset = model_name_offset + model_name_shm->Size(); + std::unique_ptr model_name_shm = PbString::LoadFromSharedMemory( + shm_pool, infer_request_shm_ptr->model_name_shm_handle); std::unique_ptr request_id_shm = PbString::LoadFromSharedMemory( - request_handle + request_id_offset, - reinterpret_cast(infer_request_shm_ptr) + request_id_offset); - - size_t parameters_offset = request_id_offset + request_id_shm->Size(); + shm_pool, infer_request_shm_ptr->request_id_shm_handle); std::unique_ptr parameters_shm = PbString::LoadFromSharedMemory( - request_handle + request_id_offset, - reinterpret_cast(infer_request_shm_ptr) + parameters_offset); + shm_pool, infer_request_shm_ptr->parameters_shm_handle); return std::unique_ptr(new InferRequest( - infer_request_shm, request_id_shm, requested_output_names_shm, - model_name_shm, input_tensors, parameters_shm)); + infer_request_shm, request_id_shm, correlation_id_shm, + requested_output_names_shm, model_name_shm, input_tensors, parameters_shm, + infer_trace_shm, is_model_decoupled)); } InferRequest::InferRequest( AllocatedSharedMemory& infer_request_shm, std::unique_ptr& request_id_shm, + std::unique_ptr& correlation_id_shm, std::vector>& requested_output_names_shm, std::unique_ptr& model_name_shm, std::vector>& input_tensors, - std::unique_ptr& parameters_shm) + std::unique_ptr& parameters_shm, + std::unique_ptr& infer_trace_shm, + bool const* is_model_decoupled) : infer_request_shm_(std::move(infer_request_shm)), request_id_shm_(std::move(request_id_shm)), requested_output_names_shm_(std::move(requested_output_names_shm)), @@ -355,57 +369,54 @@ InferRequest::InferRequest( requested_output_names.emplace(pb_string->String()); } + correlation_id_ = CorrelationId(correlation_id_shm); request_id_ = request_id_shm_->String(); parameters_ = parameters_shm_->String(); requested_output_names_ = std::move(requested_output_names); model_name_ = model_name_shm_->String(); flags_ = infer_request_shm_ptr_->flags; model_version_ = infer_request_shm_ptr_->model_version; - correlation_id_ = infer_request_shm_ptr_->correlation_id; request_address_ = infer_request_shm_ptr_->address; response_factory_address_ = infer_request_shm_ptr_->response_factory_address; is_decoupled_ = infer_request_shm_ptr_->is_decoupled; 
timeout_ = infer_request_shm_ptr_->timeout; preferred_memory_ = infer_request_shm_ptr_->preferred_memory; + trace_ = InferenceTrace(infer_trace_shm); + request_release_flags_ = infer_request_shm_ptr_->request_release_flags; #ifdef TRITON_PB_STUB + pb_cancel_ = + std::make_shared(response_factory_address_, request_address_); response_sender_ = std::make_shared( - request_address_, response_factory_address_, - Stub::GetOrCreateInstance()->SharedMemory()); + request_address_, response_factory_address_, is_model_decoupled, + RequestedOutputNames(), Stub::GetOrCreateInstance()->SharedMemory(), + pb_cancel_); #endif } -#ifndef TRITON_PB_STUB -TRITONSERVER_Error* -InferRequest::DeleteResponseFactory() +#ifdef TRITON_PB_STUB +bool +InferRequest::IsCancelled() { - TRITONBACKEND_ResponseFactory* response_factory = - reinterpret_cast( - response_factory_address_); - TRITONSERVER_Error* error = - TRITONBACKEND_ResponseFactoryDelete(response_factory); - - return error; + return pb_cancel_->IsCancelled(); } -#endif -#ifdef TRITON_PB_STUB std::shared_ptr InferRequest::GetResponseSender() { - std::unique_ptr& stub = Stub::GetOrCreateInstance(); - if (!stub->IsDecoupled()) { - throw PythonBackendException( - "'get_response_sender' function must be called only when the model is " - "using the decoupled transaction policy."); - } - return response_sender_; } std::shared_ptr InferRequest::Exec(const bool is_decoupled) { + // Release the GIL. This avoids a potential deadlock situation in the parent + // process, where every thread in the thread pool is indirectly waiting for a + // function in the stub process that acquires the GIL. Meanwhile, the current + // thread, which holds the GIL, is also waiting for the parent side to have + // the next available thread to pick up the job during resource contention. + py::gil_scoped_release release; + // BLS should not be used in "initialize" or "finalize" function. 
std::unique_ptr& stub = Stub::GetOrCreateInstance(); if (!stub->IsInitialized() || stub->IsFinalizing()) { @@ -429,7 +440,6 @@ InferRequest::Exec(const bool is_decoupled) }); try { - py::gil_scoped_release release; ipc_message = IPCMessage::Create(shm_pool, true /* inline_response */); bool has_exception = false; PythonBackendException pb_exception(std::string{}); @@ -474,18 +484,26 @@ InferRequest::Exec(const bool is_decoupled) { bi::scoped_lock lock{ *(ipc_message->ResponseMutex())}; - stub->SendIPCMessage(ipc_message); + stub->SendIPCUtilsMessage(ipc_message); ipc_message->ResponseCondition()->wait(lock); } // Additional round trip required for asking the stub process // to fill in the GPU tensor buffers if (has_gpu_tensor) { + AllocatedSharedMemory gpu_buffers_shm = + shm_pool->Load( + request_batch_shm_ptr->gpu_buffers_handle); AllocatedSharedMemory gpu_buffers_handle = shm_pool->Load( - request_batch_shm_ptr->gpu_buffers_handle); + gpu_buffers_shm.data_->buffers); try { + if (!gpu_buffers_shm.data_->success) { + std::unique_ptr error = PbString::LoadFromSharedMemory( + shm_pool, gpu_buffers_shm.data_->error); + throw PythonBackendException(error->String()); + } #ifdef TRITON_ENABLE_GPU size_t i = 0; for (auto& input_tensor : this->Inputs()) { @@ -570,7 +588,7 @@ InferRequest::Exec(const bool is_decoupled) if (!output_tensor->IsCPU()) { uint64_t memory_release_id = output_tensor->Memory()->MemoryReleaseId(); output_tensor->Memory()->SetMemoryReleaseCallback( - [&memory_manager_message_queue, memory_release_id]() { + [&memory_manager_message_queue, memory_release_id, &shm_pool]() { memory_manager_message_queue->Push(memory_release_id); }); } diff --git a/src/infer_request.h b/src/infer_request.h index 96b65dc0..f368d692 100644 --- a/src/infer_request.h +++ b/src/infer_request.h @@ -1,4 +1,4 @@ -// Copyright 2021-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// Copyright 2021-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
// // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions @@ -28,11 +28,15 @@ #include #include + +#include "correlation_id.h" #include "infer_response.h" +#include "infer_trace.h" #include "pb_preferred_memory.h" #include "pb_tensor.h" #ifdef TRITON_PB_STUB +#include "pb_cancel.h" #include "response_sender.h" #endif @@ -44,7 +48,6 @@ class Stub; // Inference Request // struct InferRequestShm { - uint64_t correlation_id; uint32_t input_count; uint32_t requested_output_count; int64_t model_version; @@ -52,41 +55,53 @@ struct InferRequestShm { intptr_t address; intptr_t response_factory_address; bool is_decoupled; - int32_t timeout; + uint64_t timeout; PreferredMemory preferred_memory; + bi::managed_external_buffer::handle_t trace_shm_handle; + uint32_t request_release_flags; + bi::managed_external_buffer::handle_t correlation_id_shm_handle; + bi::managed_external_buffer::handle_t model_name_shm_handle; + bi::managed_external_buffer::handle_t request_id_shm_handle; + bi::managed_external_buffer::handle_t parameters_shm_handle; }; class InferRequest { public: InferRequest( - const std::string& request_id, uint64_t correlation_id, + const std::string& request_id, const CorrelationId& correlation_id, const std::vector>& inputs, const std::set& requested_output_names, const std::string& model_name, const int64_t model_version, const std::string& parameters, const uint32_t flags = 0, - const int32_t timeout = 0, const intptr_t response_factory_address = 0, + const uint64_t timeout = 0, const intptr_t response_factory_address = 0, const intptr_t request_address = 0, const PreferredMemory& preferred_memory = - PreferredMemory(PreferredMemory::DEFAULT, 0)); + PreferredMemory(PreferredMemory::kDefault, 0), + const InferenceTrace& trace = InferenceTrace()); const std::vector>& Inputs(); const std::string& RequestId(); const std::string& Parameters(); - uint64_t CorrelationId(); + CorrelationId& GetCorrelationId(); const std::string& ModelName(); int64_t ModelVersion(); uint32_t Flags(); void SetFlags(uint32_t flags); const std::set& RequestedOutputNames(); bi::managed_external_buffer::handle_t ShmHandle(); - int32_t Timeout(); + uint64_t Timeout(); bool IsDecoupled(); void SetIsDecoupled(const bool is_decoupled); PreferredMemory& GetPreferredMemory(); + InferenceTrace& GetTrace(); + uint32_t ReleaseFlags(); + void SetReleaseFlags(const uint32_t& flags); + intptr_t GetResponseFactoryAddress() { return response_factory_address_; } #ifdef TRITON_PB_STUB std::shared_ptr Exec(const bool is_decoupled); std::shared_ptr GetResponseSender(); + bool IsCancelled(); #endif /// Save an Inference Request to shared memory. @@ -104,7 +119,7 @@ class InferRequest { static std::unique_ptr LoadFromSharedMemory( std::unique_ptr& shm_pool, bi::managed_external_buffer::handle_t request_handle, - bool open_cuda_handle); + bool open_cuda_handle, bool const* is_model_decoupled); /// Disallow copying the inference request object. 
DISALLOW_COPY_AND_ASSIGN(InferRequest); @@ -112,32 +127,33 @@ class InferRequest { intptr_t RequestAddress(); ~InferRequest() {} -#ifndef TRITON_PB_STUB - TRITONSERVER_Error* DeleteResponseFactory(); -#endif - private: InferRequest( AllocatedSharedMemory& infer_request_shm, std::unique_ptr& request_id_shm, + std::unique_ptr& correlation_id, std::vector>& requested_output_names_shm, std::unique_ptr& model_name_shm, std::vector>& input_tensors, - std::unique_ptr& parameters_shm); + std::unique_ptr& parameters_shm, + std::unique_ptr& infer_trace_shm, + bool const* is_model_decoupled); std::string request_id_; - uint64_t correlation_id_; + CorrelationId correlation_id_; std::vector> inputs_; std::set requested_output_names_; std::string model_name_; int64_t model_version_; std::string parameters_; uint32_t flags_; - int32_t timeout_; + uint64_t timeout_; intptr_t response_factory_address_; intptr_t request_address_; bool is_decoupled_; PreferredMemory preferred_memory_; + InferenceTrace trace_; + uint32_t request_release_flags_; // Shared Memory Data Structures AllocatedSharedMemory infer_request_shm_; @@ -152,6 +168,7 @@ class InferRequest { std::unique_ptr parameters_shm_; #ifdef TRITON_PB_STUB + std::shared_ptr pb_cancel_; std::shared_ptr response_sender_; #endif }; diff --git a/src/infer_response.cc b/src/infer_response.cc index 4defd74b..382756d4 100644 --- a/src/infer_response.cc +++ b/src/infer_response.cc @@ -1,4 +1,4 @@ -// Copyright 2021-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// Copyright 2021-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions @@ -31,6 +31,7 @@ namespace py = pybind11; #endif #include + #include "scoped_defer.h" @@ -38,8 +39,10 @@ namespace triton { namespace backend { namespace python { InferResponse::InferResponse( const std::vector>& output_tensors, - std::shared_ptr error, const bool is_last_response, void* id) - : error_(error), is_last_response_(is_last_response), id_(id) + std::shared_ptr error, std::string parameters, + const bool is_last_response, void* id) + : error_(error), is_last_response_(is_last_response), id_(id), + parameters_(std::move(parameters)) { for (auto& output : output_tensors) { if (!output) { @@ -57,6 +60,12 @@ InferResponse::OutputTensors() return output_tensors_; } +const std::string& +InferResponse::Parameters() const +{ + return parameters_; +} + bool InferResponse::HasError() { @@ -82,6 +91,7 @@ InferResponse::SaveToSharedMemory( response_shm_ptr->is_error_set = false; shm_handle_ = response_shm_.handle_; response_shm_ptr->is_last_response = is_last_response_; + response_shm_ptr->id = id_; // Only save the output tensors to shared memory when the inference response // doesn't have error. @@ -104,7 +114,9 @@ InferResponse::SaveToSharedMemory( tensor_handle_shm_ptr[j] = output_tensor->ShmHandle(); j++; } - response_shm_ptr->id = id_; + + parameters_shm_ = PbString::Create(shm_pool, parameters_); + response_shm_ptr->parameters = parameters_shm_->ShmHandle(); } } @@ -142,6 +154,8 @@ InferResponse::LoadFromSharedMemory( std::shared_ptr pb_error; std::vector> output_tensors; + std::shared_ptr parameters_shm; + std::string parameters; // If the error field is set, do not load output tensors from shared memory. 
if (response_shm_ptr->has_error && response_shm_ptr->is_error_set) { @@ -153,26 +167,35 @@ InferResponse::LoadFromSharedMemory( bi::managed_external_buffer::handle_t* tensor_handle_shm = reinterpret_cast( response_shm.data_.get() + sizeof(ResponseShm)); + { #ifdef TRITON_PB_STUB - // Need to acquire the GIL to avoid hangs. - py::gil_scoped_acquire acquire; + // PbTensor::LoadFromSharedMemory() will construct Python objects if + // called from pb_stub, which requires holding the GIL. + py::gil_scoped_acquire acquire; #endif - for (size_t idx = 0; idx < requested_output_count; ++idx) { - std::shared_ptr pb_tensor = PbTensor::LoadFromSharedMemory( - shm_pool, tensor_handle_shm[idx], open_cuda_handle); - output_tensors.emplace_back(std::move(pb_tensor)); + for (size_t idx = 0; idx < requested_output_count; ++idx) { + std::shared_ptr pb_tensor = PbTensor::LoadFromSharedMemory( + shm_pool, tensor_handle_shm[idx], open_cuda_handle); + output_tensors.emplace_back(std::move(pb_tensor)); + } } + + parameters_shm = std::move( + PbString::LoadFromSharedMemory(shm_pool, response_shm_ptr->parameters)); + parameters = parameters_shm->String(); } return std::unique_ptr(new InferResponse( response_shm, output_tensors, pb_error, - response_shm_ptr->is_last_response, response_shm_ptr->id)); + response_shm_ptr->is_last_response, response_shm_ptr->id, parameters_shm, + parameters)); } InferResponse::InferResponse( AllocatedSharedMemory& response_shm, std::vector>& output_tensors, - std::shared_ptr& pb_error, const bool is_last_response, void* id) + std::shared_ptr& pb_error, const bool is_last_response, void* id, + std::shared_ptr& parameters_shm, std::string& parameters) { response_shm_ = std::move(response_shm); output_tensors_ = std::move(output_tensors); @@ -180,6 +203,8 @@ InferResponse::InferResponse( shm_handle_ = response_shm_.handle_; id_ = id; is_last_response_ = is_last_response; + parameters_shm_ = std::move(parameters_shm); + parameters_ = std::move(parameters); } std::shared_ptr& @@ -201,64 +226,54 @@ InferResponse::IsLastResponse() } #ifndef TRITON_PB_STUB -std::shared_ptr +void InferResponse::Send( - TRITONBACKEND_ResponseFactory* response_factory, void* cuda_stream, + TRITONBACKEND_Response* response, void* cuda_stream, bool& requires_deferred_callback, const uint32_t flags, std::unique_ptr& shm_pool, + GPUBuffersHelper& gpu_buffer_helper, std::vector, void*>>& output_buffers, - const std::set& requested_output_names, - TRITONBACKEND_Response* response) + const std::set& requested_output_names) { +#ifdef TRITON_ENABLE_GPU + static bool log_warning = true; +#endif // TRITON_ENABLE_GPU + std::shared_ptr response_error = WrapTritonErrorInSharedPtr(nullptr); std::unique_ptr response_error_handling; requires_deferred_callback = false; - // Should only destruct the response factory whenever a response factory is - // being created. - bool destruct_response_factor = (response == nullptr); - - if (response == nullptr) { - SET_ERROR_AND_RETURN( - response_error, - TRITONBACKEND_ResponseNewFromFactory(&response, response_factory)); - } - // This lambda expression will be called when this function exits, if the // inference response doesn't have any GPU tensors. Otherwise, it will be // called when the object is destructed or DeferredSendCallback is called. 
- response_error_handling = std::make_unique( - [response, response_error, flags, response_factory, - destruct_response_factor] { + response_error_handling = + std::make_unique([response, response_error, flags] { if (response != nullptr) { LOG_IF_ERROR( TRITONBACKEND_ResponseSend(response, flags, *response_error), "failed to send the response."); - if (flags == TRITONSERVER_RESPONSE_COMPLETE_FINAL && - destruct_response_factor) { - std::unique_ptr< - TRITONBACKEND_ResponseFactory, backend::ResponseFactoryDeleter> - response_factory_ptr( - reinterpret_cast( - response_factory)); - } } }); // Moves the response sending callback so that it is not called until the stub // process fills in the GPU buffers. - ScopedDefer deferred_task( - [this, &requires_deferred_callback, &response_error_handling] { - if (requires_deferred_callback) { - deferred_send_callback_ = std::move(response_error_handling); - } - }); + ScopedDefer deferred_task([this, &requires_deferred_callback, + &response_error_handling, &gpu_buffer_helper, + response_error, &shm_pool] { + if (*response_error != nullptr) { + gpu_buffer_helper.SetError( + shm_pool, TRITONSERVER_ErrorMessage(*response_error)); + } + if (requires_deferred_callback) { + deferred_send_callback_ = std::move(response_error_handling); + } + }); if (HasError()) { - *response_error = TRITONSERVER_ErrorNew( - TRITONSERVER_ERROR_INTERNAL, Error()->Message().c_str()); - return nullptr; + *response_error = + TRITONSERVER_ErrorNew(Error()->Code(), Error()->Message().c_str()); + return; } bool cuda_copy = false; @@ -283,11 +298,12 @@ InferResponse::Send( static_cast(output_tensor->TritonDtype()), output_tensor->Dims().data(), output_tensor->Dims().size())); - void* buffer; + void* triton_output_buffer; SET_ERROR_AND_RETURN( - response_error, TRITONBACKEND_OutputBuffer( - response_output, &buffer, output_tensor->ByteSize(), - &actual_memory_type, &actual_memory_type_id)); + response_error, + TRITONBACKEND_OutputBuffer( + response_output, &triton_output_buffer, output_tensor->ByteSize(), + &actual_memory_type, &actual_memory_type_id)); bool cuda_used = false; TRITONSERVER_BufferAttributes* output_buffer_attributes; @@ -299,6 +315,40 @@ InferResponse::Send( if (src_memory_type == TRITONSERVER_MEMORY_GPU && actual_memory_type == TRITONSERVER_MEMORY_GPU) { #ifdef TRITON_ENABLE_GPU + // Check if the triton-provided output buffer is using CUDA shared memory + // pool. If not, try to allocate a new buffer from the pool. 
+ void* buffer = triton_output_buffer; + BackendMemory* backend_memory; + std::unique_ptr lbackend_memory; + std::unique_ptr& cuda_pool = + shm_pool->GetCUDAMemoryPoolManager(); + if (cuda_pool->UseCudaSharedPool(src_memory_type_id)) { + try { + if (!IsUsingCUDAPool( + cuda_pool, actual_memory_type_id, triton_output_buffer)) { + THROW_IF_TRITON_ERROR(BackendMemory::Create( + reinterpret_cast( + shm_pool->GetCUDAMemoryPoolManager() + ->TritonMemoryManager()), + BackendMemory::AllocationType::GPU_POOL, actual_memory_type_id, + output_tensor->ByteSize(), &backend_memory)); + lbackend_memory.reset(backend_memory); + buffer = lbackend_memory->MemoryPtr(); + } + } + catch (const PythonBackendException& pb_exception) { + if (log_warning) { + LOG_MESSAGE( + TRITONSERVER_LOG_WARN, + (std::string("Failed to allocate memory from CUDA memory pool " + "for output tensor: ") + + pb_exception.what() + + std::string(", will use CUDA IPC for GPU output transfer.")) + .c_str()); + } + log_warning = false; + } + } cudaIpcMemHandle_t* cuda_ipc_mem_handle_p; SET_ERROR_AND_RETURN( response_error, @@ -322,7 +372,13 @@ InferResponse::Send( output_tensor->ByteSize(), reinterpret_cast(buffer), true /* copy_gpu */)); } - output_buffers.push_back({std::move(output_buffer), buffer}); + + if (lbackend_memory != nullptr) { + output_buffer->SetBackendMemory(std::move(lbackend_memory)); + } + gpu_buffer_helper.AddBuffer(output_buffer->ShmHandle()); + output_buffers.push_back( + {std::move(output_buffer), triton_output_buffer}); #endif } @@ -336,7 +392,9 @@ InferResponse::Send( shm_pool, actual_memory_type, actual_memory_type_id, output_tensor->ByteSize(), nullptr /* data ptr */)); - output_buffers.push_back({std::move(output_buffer), buffer}); + gpu_buffer_helper.AddBuffer(output_buffer->ShmHandle()); + output_buffers.push_back( + {std::move(output_buffer), triton_output_buffer}); } if (src_memory_type != TRITONSERVER_MEMORY_GPU) { @@ -345,20 +403,51 @@ InferResponse::Send( CopyBuffer( "Failed to copy the output tensor to buffer.", src_memory_type, src_memory_type_id, actual_memory_type, actual_memory_type_id, - output_tensor->ByteSize(), output_tensor->DataPtr(), buffer, - reinterpret_cast(cuda_stream), &cuda_used)); + output_tensor->ByteSize(), output_tensor->DataPtr(), + triton_output_buffer, reinterpret_cast(cuda_stream), + &cuda_used)); } cuda_copy |= cuda_used; } + if (!parameters_.empty()) { + triton::common::TritonJson::Value param; + THROW_IF_TRITON_ERROR( + param.Parse(parameters_.c_str(), parameters_.length())); + std::vector param_keys; + THROW_IF_TRITON_ERROR(param.Members(¶m_keys)); + for (const auto& key : param_keys) { + triton::common::TritonJson::Value value; + if (!param.Find(key.c_str(), &value)) { + throw PythonBackendException("Unexpected missing key on parameters"); + } + if (value.IsString()) { + std::string string_value; + THROW_IF_TRITON_ERROR(value.AsString(&string_value)); + THROW_IF_TRITON_ERROR(TRITONBACKEND_ResponseSetStringParameter( + response, key.c_str(), string_value.c_str())); + } else if (value.IsInt()) { + int64_t int_value = 0; + THROW_IF_TRITON_ERROR(value.AsInt(&int_value)); + THROW_IF_TRITON_ERROR(TRITONBACKEND_ResponseSetIntParameter( + response, key.c_str(), int_value)); + } else if (value.IsBool()) { + bool bool_value = false; + THROW_IF_TRITON_ERROR(value.AsBool(&bool_value)); + THROW_IF_TRITON_ERROR(TRITONBACKEND_ResponseSetBoolParameter( + response, key.c_str(), bool_value)); + } else { + throw PythonBackendException("Unsupported value type on parameters"); + } + } + } + 
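The new parameters block above parses the response's JSON parameters string with TritonJson and forwards each member to the matching TRITONBACKEND_ResponseSet{String,Int,Bool}Parameter call. The sketch below shows the same parse-then-dispatch-on-type loop in isolation; it is an illustrative assumption rather than backend code: PrintResponseParameters and the Ok() helper are invented, printing replaces the TRITONBACKEND_* setters, and the TRITONJSON_STATUS* macro block reflects the usual way triton_json.h is configured to report failures as TRITONSERVER_Error*.

```cpp
// Illustrative only: walk a JSON parameters string with TritonJson and
// dispatch on the value type, mirroring the response-parameter loop above.
#include <cstdint>
#include <iostream>
#include <string>
#include <vector>

#include "triton/core/tritonserver.h"

// triton_json.h reports failures through includer-supplied macros; mapping
// them to TRITONSERVER_Error* matches the convention used in the diff above.
#define TRITONJSON_STATUSTYPE TRITONSERVER_Error*
#define TRITONJSON_STATUSRETURN(M) \
  return TRITONSERVER_ErrorNew(TRITONSERVER_ERROR_INTERNAL, (M).c_str())
#define TRITONJSON_STATUSSUCCESS nullptr
#include "triton/common/triton_json.h"

namespace {
// Returns false (and frees the error object) if a TritonJson call failed.
bool
Ok(TRITONSERVER_Error* err)
{
  if (err != nullptr) {
    TRITONSERVER_ErrorDelete(err);
    return false;
  }
  return true;
}
}  // namespace

void
PrintResponseParameters(const std::string& parameters)
{
  triton::common::TritonJson::Value param;
  if (!Ok(param.Parse(parameters.c_str(), parameters.length()))) {
    return;  // not a JSON object
  }
  std::vector<std::string> keys;
  if (!Ok(param.Members(&keys))) {
    return;
  }
  for (const auto& key : keys) {
    triton::common::TritonJson::Value value;
    if (!param.Find(key.c_str(), &value)) {
      continue;
    }
    if (value.IsString()) {
      std::string s;
      if (Ok(value.AsString(&s))) {
        std::cout << key << " (string) = " << s << "\n";
      }
    } else if (value.IsInt()) {
      int64_t i = 0;
      if (Ok(value.AsInt(&i))) {
        std::cout << key << " (int) = " << i << "\n";
      }
    } else if (value.IsBool()) {
      bool b = false;
      if (Ok(value.AsBool(&b))) {
        std::cout << key << " (bool) = " << (b ? "true" : "false") << "\n";
      }
    } else {
      // Mirrors the "Unsupported value type on parameters" exception above.
      std::cout << key << ": unsupported parameter type\n";
    }
  }
}

// Example: PrintResponseParameters(R"({"stage": "decode", "batch": 4})");
```

Any value that is not a string, integer, or boolean falls into the unsupported branch, matching the exception thrown in the backend code above.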
#ifdef TRITON_ENABLE_GPU if (cuda_copy) { cudaStreamSynchronize(reinterpret_cast(cuda_stream)); } #endif // TRITON_ENABLE_GPU - - return response_error; } #endif diff --git a/src/infer_response.h b/src/infer_response.h index 9197df4e..ab8eb68a 100644 --- a/src/infer_response.h +++ b/src/infer_response.h @@ -1,4 +1,4 @@ -// Copyright 2021-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// Copyright 2021-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions @@ -27,6 +27,8 @@ #pragma once #include + +#include "gpu_buffers.h" #include "pb_error.h" #include "pb_tensor.h" #include "pb_utils.h" @@ -36,6 +38,7 @@ namespace triton { namespace backend { namespace python { struct ResponseShm { uint32_t outputs_size; + bi::managed_external_buffer::handle_t parameters; bi::managed_external_buffer::handle_t error; bool has_error; // Indicates whether this error has a message or not. @@ -49,7 +52,7 @@ struct ResponseShm { TRITONSERVER_Error* raasnie_err__ = (X); \ if (raasnie_err__ != nullptr) { \ *E = raasnie_err__; \ - return E; \ + return; \ } \ } while (false) @@ -62,7 +65,7 @@ struct ResponseShm { TRITONSERVER_Error* rarie_err__ = TRITONSERVER_ErrorNew( \ TRITONSERVER_ERROR_INTERNAL, pb_exception.what()); \ *E = rarie_err__; \ - return E; \ + return; \ } \ } while (false) @@ -70,9 +73,10 @@ class InferResponse { public: InferResponse( const std::vector>& output_tensors, - std::shared_ptr error = nullptr, + std::shared_ptr error = nullptr, std::string parameters = "", const bool is_last_response = true, void* id = nullptr); std::vector>& OutputTensors(); + const std::string& Parameters() const; // JSON serializable unless empty void SaveToSharedMemory( std::unique_ptr& shm_pool, bool copy_gpu = true); static std::unique_ptr LoadFromSharedMemory( @@ -96,13 +100,13 @@ class InferResponse { /// response needs to be done in two step. The boolean /// 'requires_deferred_callback' indicates whether DeferredSendCallback method /// should be called or not. - std::shared_ptr Send( - TRITONBACKEND_ResponseFactory* response_factory, void* cuda_stream, + void Send( + TRITONBACKEND_Response* response, void* cuda_stream, bool& requires_deferred_callback, const uint32_t flags, std::unique_ptr& shm_pool, + GPUBuffersHelper& gpu_buffer_helper, std::vector, void*>>& output_buffers, - const std::set& requested_output_names = {}, - TRITONBACKEND_Response* response = nullptr); + const std::set& requested_output_names = {}); void DeferredSendCallback(); #endif @@ -114,8 +118,8 @@ class InferResponse { InferResponse( AllocatedSharedMemory& response_shm, std::vector>& output_tensors, - std::shared_ptr& pb_error, const bool is_last_response, - void* id); + std::shared_ptr& pb_error, const bool is_last_response, void* id, + std::shared_ptr& parameters_shm, std::string& parameters); std::vector> output_tensors_; std::shared_ptr error_; @@ -126,6 +130,9 @@ class InferResponse { bool is_last_response_; // Representing the request id that the response was created from. void* id_; + + std::shared_ptr parameters_shm_; + std::string parameters_; }; }}} // namespace triton::backend::python diff --git a/src/infer_trace.cc b/src/infer_trace.cc new file mode 100644 index 00000000..50645dcc --- /dev/null +++ b/src/infer_trace.cc @@ -0,0 +1,101 @@ +// Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions +// are met: +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// * Neither the name of NVIDIA CORPORATION nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +#include "infer_trace.h" + +namespace triton { namespace backend { namespace python { + +InferenceTrace::InferenceTrace(const InferenceTrace& rhs) +{ + triton_trace_ = rhs.triton_trace_; + trace_context_ = rhs.trace_context_; +} + +InferenceTrace& +InferenceTrace::operator=(const InferenceTrace& rhs) +{ + triton_trace_ = rhs.triton_trace_; + trace_context_ = rhs.trace_context_; + return *this; +} + +InferenceTrace::InferenceTrace(std::unique_ptr& trace_shm) +{ + triton_trace_ = trace_shm->triton_trace_; + trace_context_ = trace_shm->trace_context_; +} + +void +InferenceTrace::SaveToSharedMemory( + std::unique_ptr& shm_pool) +{ + AllocatedSharedMemory infer_trace_shm = + shm_pool->Construct(); + infer_trace_shm_ptr_ = infer_trace_shm.data_.get(); + + infer_trace_shm_ptr_->triton_trace = triton_trace_; + + std::unique_ptr trace_context_shm = + PbString::Create(shm_pool, trace_context_); + + infer_trace_shm_ptr_->trace_context_shm_handle = + trace_context_shm->ShmHandle(); + + // Save the references to shared memory. 
+ trace_context_shm_ = std::move(trace_context_shm); + infer_trace_shm_ = std::move(infer_trace_shm); + shm_handle_ = infer_trace_shm_.handle_; +} + +std::unique_ptr +InferenceTrace::LoadFromSharedMemory( + std::unique_ptr& shm_pool, + bi::managed_external_buffer::handle_t handle) +{ + AllocatedSharedMemory infer_trace_shm = + shm_pool->Load(handle); + InferenceTraceShm* infer_trace_shm_ptr = infer_trace_shm.data_.get(); + + std::unique_ptr trace_context_shm = PbString::LoadFromSharedMemory( + shm_pool, infer_trace_shm_ptr->trace_context_shm_handle); + + return std::unique_ptr( + new InferenceTrace(infer_trace_shm, trace_context_shm)); +} + +InferenceTrace::InferenceTrace( + AllocatedSharedMemory& infer_trace_shm, + std::unique_ptr& trace_context_shm) + : infer_trace_shm_(std::move(infer_trace_shm)), + trace_context_shm_(std::move(trace_context_shm)) +{ + infer_trace_shm_ptr_ = infer_trace_shm_.data_.get(); + shm_handle_ = infer_trace_shm_.handle_; + triton_trace_ = infer_trace_shm_ptr_->triton_trace; + trace_context_ = trace_context_shm_->String(); +} + +}}}; // namespace triton::backend::python diff --git a/src/infer_trace.h b/src/infer_trace.h new file mode 100644 index 00000000..aac9137f --- /dev/null +++ b/src/infer_trace.h @@ -0,0 +1,90 @@ +// Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions +// are met: +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// * Neither the name of NVIDIA CORPORATION nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +#pragma once + +#include + +#include "pb_string.h" +#include "pb_utils.h" + +namespace triton { namespace backend { namespace python { + +struct InferenceTraceShm { + bi::managed_external_buffer::handle_t trace_context_shm_handle; + // The address of the 'TRITONSERVER_InferTrace' object. 
+ void* triton_trace; +}; + +// +// Inference Trace +// +class InferenceTrace { + public: + InferenceTrace(void* triton_trace, const std::string& ctxt) + : triton_trace_(triton_trace), trace_context_(ctxt) + { + } + InferenceTrace() : triton_trace_(nullptr), trace_context_("") {} + InferenceTrace(const InferenceTrace& rhs); + InferenceTrace(std::unique_ptr& trace_shm); + InferenceTrace& operator=(const InferenceTrace& rhs); + /// Save InferenceTrace object to shared memory. + /// \param shm_pool Shared memory pool to save the InferenceTrace object. + void SaveToSharedMemory(std::unique_ptr& shm_pool); + + /// Create a InferenceTrace object from shared memory. + /// \param shm_pool Shared memory pool + /// \param handle Shared memory handle of the InferenceTrace. + /// \return Returns the InferenceTrace in the specified handle + /// location. + static std::unique_ptr LoadFromSharedMemory( + std::unique_ptr& shm_pool, + bi::managed_external_buffer::handle_t handle); + + void* TritonTrace() { return triton_trace_; } + const std::string& Context() const { return trace_context_; } + + bi::managed_external_buffer::handle_t ShmHandle() { return shm_handle_; } + + private: + // The private constructor for creating a InferenceTrace object from shared + // memory. + InferenceTrace( + AllocatedSharedMemory& infer_trace_shm, + std::unique_ptr& trace_context_shm); + + void* triton_trace_; + std::string trace_context_; + + // Shared Memory Data Structures + AllocatedSharedMemory infer_trace_shm_; + InferenceTraceShm* infer_trace_shm_ptr_; + bi::managed_external_buffer::handle_t shm_handle_; + std::unique_ptr trace_context_shm_; +}; + +}}}; // namespace triton::backend::python diff --git a/src/ipc_message.cc b/src/ipc_message.cc index ea1dc5b0..2fa13ba3 100644 --- a/src/ipc_message.cc +++ b/src/ipc_message.cc @@ -56,6 +56,21 @@ IPCMessage::Create( new IPCMessage(ipc_message_shm, response_mutex_shm, response_cond_shm)); } +std::unique_ptr +IPCMessage::Create( + IPCMessageShm* ipc_message_shm, + bi::managed_external_buffer::handle_t& message_handle) +{ + return std::unique_ptr( + new IPCMessage(ipc_message_shm, message_handle)); +} + +AllocatedSharedMemory& +IPCMessage::GetAllocatedSharedMemory() +{ + return ipc_message_shm_; +} + std::unique_ptr IPCMessage::LoadFromSharedMemory( std::unique_ptr& shm_pool, @@ -133,4 +148,12 @@ IPCMessage::IPCMessage( ipc_message_handle_ = ipc_message_shm_.handle_; } +IPCMessage::IPCMessage( + IPCMessageShm* ipc_message_shm, + bi::managed_external_buffer::handle_t& handle) +{ + ipc_message_handle_ = handle; + ipc_message_shm_ptr_ = ipc_message_shm; +} + }}}; // namespace triton::backend::python diff --git a/src/ipc_message.h b/src/ipc_message.h index 4ec15290..c0fab3a3 100644 --- a/src/ipc_message.h +++ b/src/ipc_message.h @@ -1,4 +1,4 @@ -// Copyright 2021-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// Copyright 2021-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
// // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions @@ -28,6 +28,7 @@ #include #include + #include "shm_manager.h" @@ -40,18 +41,34 @@ typedef enum PYTHONSTUB_commandtype_enum { PYTHONSTUB_ExecuteResponse, PYTHONSTUB_InitializeRequest, PYTHONSTUB_InitializeResponse, + PYTHONSTUB_CUDAPoolInitializeRequest, PYTHONSTUB_FinalizeRequest, PYTHONSTUB_FinalizeResponse, PYTHONSTUB_LoadGPUBuffers, PYTHONSTUB_InferExecRequest, PYTHONSTUB_InferStreamExecRequest, PYTHONSTUB_InferExecResponse, + PYTHONSTUB_InferStreamExecResponse, PYTHONSTUB_ResponseSend, PYTHONSTUB_ResponseClose, PYTHONSTUB_AutoCompleteRequest, PYTHONSTUB_AutoCompleteResponse, PYTHONSTUB_LogRequest, - PYTHONSTUB_CleanupRequest + PYTHONSTUB_BLSDecoupledInferPayloadCleanup, + PYTHONSTUB_DecoupledResponseFactoryCleanup, + PYTHONSTUB_MetricFamilyRequestNew, + PYTHONSTUB_MetricFamilyRequestDelete, + PYTHONSTUB_MetricRequestNew, + PYTHONSTUB_MetricRequestDelete, + PYTHONSTUB_MetricRequestValue, + PYTHONSTUB_MetricRequestIncrement, + PYTHONSTUB_MetricRequestSet, + PYTHONSTUB_MetricRequestObserve, + PYTHONSTUB_LoadModelRequest, + PYTHONSTUB_UnloadModelRequest, + PYTHONSTUB_ModelReadinessRequest, + PYTHONSTUB_IsRequestCancelled, + PYTHONSTUB_CancelBLSInferRequest } PYTHONSTUB_CommandType; /// @@ -81,6 +98,10 @@ class IPCMessage { static std::unique_ptr Create( const std::unique_ptr& shm_pool, bool inline_response); + + static std::unique_ptr Create( + IPCMessageShm* ipc_message_shm, + bi::managed_external_buffer::handle_t& message_handle); static std::unique_ptr LoadFromSharedMemory( std::unique_ptr& shm_pool, bi::managed_external_buffer::handle_t message_handle); @@ -92,6 +113,7 @@ class IPCMessage { bi::interprocess_mutex* ResponseMutex(); bi::managed_external_buffer::handle_t& Args(); bi::managed_external_buffer::handle_t ShmHandle(); + AllocatedSharedMemory& GetAllocatedSharedMemory(); private: AllocatedSharedMemory ipc_message_shm_; @@ -113,6 +135,10 @@ class IPCMessage { AllocatedSharedMemory& ipc_message_shm, AllocatedSharedMemory& response_mutex_shm, AllocatedSharedMemory& response_cond_shm); + + IPCMessage( + IPCMessageShm* ipc_message_shm, + bi::managed_external_buffer::handle_t& handle); }; }}}; // namespace triton::backend::python diff --git a/src/memory_manager.cc b/src/memory_manager.cc index 54bdfe39..716dee9e 100644 --- a/src/memory_manager.cc +++ b/src/memory_manager.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// Copyright 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions @@ -25,6 +25,7 @@ // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "memory_manager.h" + #include "pb_utils.h" @@ -32,29 +33,23 @@ namespace triton { namespace backend { namespace python { #ifdef TRITON_ENABLE_GPU -GPUMemoryRecord::GPUMemoryRecord(void* ptr) +BackendMemoryRecord::BackendMemoryRecord( + std::unique_ptr backend_memory) + : backend_memory_(std::move(backend_memory)) { - ptr_ = ptr; release_callback_ = [](void* ptr) { - cudaError_t err = cudaFree(ptr); - if (err != cudaSuccess) { - LOG_MESSAGE( - TRITONSERVER_LOG_ERROR, - (std::string("Failed to free the allocated cuda memory. error: ") + - cudaGetErrorString(err)) - .c_str()); - } + // Do nothing. The backend_memory_ will be destroyed in the destructor. 
}; } void* -GPUMemoryRecord::MemoryId() +BackendMemoryRecord::MemoryId() { - return ptr_; + return reinterpret_cast(backend_memory_->MemoryPtr()); } const std::function& -GPUMemoryRecord::ReleaseCallback() +BackendMemoryRecord::ReleaseCallback() { return release_callback_; } @@ -100,6 +95,7 @@ MemoryManager::QueueMonitorThread() // Call the release callback. it->second->ReleaseCallback()(it->second->MemoryId()); + // it->second.reset(); records_.erase(it); } } diff --git a/src/memory_manager.h b/src/memory_manager.h index 7930d0e8..5b7e35f5 100644 --- a/src/memory_manager.h +++ b/src/memory_manager.h @@ -1,4 +1,4 @@ -// Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// Copyright 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions @@ -30,8 +30,10 @@ #include #include #include + #include "message_queue.h" #include "triton/backend/backend_common.h" +#include "triton/backend/backend_memory.h" #include "triton/core/tritonserver.h" #ifdef TRITON_ENABLE_GPU @@ -45,17 +47,19 @@ class MemoryRecord { public: virtual const std::function& ReleaseCallback() = 0; virtual void* MemoryId() = 0; + virtual ~MemoryRecord() = default; }; #ifdef TRITON_ENABLE_GPU -class GPUMemoryRecord : public MemoryRecord { +class BackendMemoryRecord : public MemoryRecord { public: - GPUMemoryRecord(void* ptr); + BackendMemoryRecord(std::unique_ptr backend_memory); const std::function& ReleaseCallback() override; void* MemoryId() override; + ~BackendMemoryRecord() { backend_memory_.reset(); } private: - void* ptr_; + std::unique_ptr backend_memory_; std::function release_callback_; }; #endif diff --git a/src/message_queue.h b/src/message_queue.h index bb87e04a..06661c66 100644 --- a/src/message_queue.h +++ b/src/message_queue.h @@ -1,4 +1,4 @@ -// Copyright 2021-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// Copyright 2021-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions @@ -31,14 +31,20 @@ #include #include #include + +#include "pb_utils.h" #include "shm_manager.h" +#ifdef TRITON_PB_STUB +#include "pb_stub_log.h" +#endif namespace triton { namespace backend { namespace python { namespace bi = boost::interprocess; -/// Struct holding the represenation of a message queue inside the shared +/// Struct holding the representation of a message queue inside the shared /// memory. -/// \param size Total size of the message queue. +/// \param size Total size of the message queue. Considered invalid after +/// MessageQueue::LoadFromSharedMemory. Check DLIS-8378 for additional details. /// \param mutex Handle of the mutex variable protecting index. /// \param index Used element index. /// \param sem_empty Semaphore object counting the number of empty buffer slots. @@ -109,7 +115,22 @@ class MessageQueue { { bi::scoped_lock lock{*MutexMutable()}; - Buffer()[Head()] = message; + int head_idx = Head(); + // Additional check to avoid out of bounds read/write. Check DLIS-8378 for + // additional details. + if (head_idx < 0 || static_cast(head_idx) >= Size()) { + std::string error_msg = + "internal error: message queue head index out of bounds. 
Expects " + "positive integer less than the size of message queue " + + std::to_string(Size()) + " but got " + std::to_string(head_idx); +#ifdef TRITON_PB_STUB + LOG_ERROR << error_msg; +#else + LOG_MESSAGE(TRITONSERVER_LOG_ERROR, error_msg.c_str()); +#endif + return; + } + Buffer()[head_idx] = message; HeadIncrement(); } SemFullMutable()->post(); @@ -144,7 +165,22 @@ class MessageQueue { } success = true; - Buffer()[Head()] = message; + int head_idx = Head(); + // Additional check to avoid out of bounds read/write. Check DLIS-8378 for + // additional details. + if (head_idx < 0 || static_cast(head_idx) >= Size()) { + std::string error_msg = + "internal error: message queue head index out of bounds. Expects " + "positive integer less than the size of message queue " + + std::to_string(Size()) + " but got " + std::to_string(head_idx); +#ifdef TRITON_PB_STUB + LOG_ERROR << error_msg; +#else + LOG_MESSAGE(TRITONSERVER_LOG_ERROR, error_msg.c_str()); +#endif + return; + } + Buffer()[head_idx] = message; HeadIncrement(); } SemFullMutable()->post(); @@ -243,7 +279,7 @@ class MessageQueue { } private: - std::size_t& Size() { return mq_shm_ptr_->size; } + uint32_t Size() { return size_; } const bi::interprocess_mutex& Mutex() { return mq_shm_ptr_->mutex; } bi::interprocess_mutex* MutexMutable() { return &(mq_shm_ptr_->mutex); } int& Head() { return mq_shm_ptr_->head; } @@ -272,6 +308,7 @@ class MessageQueue { MessageQueueShm* mq_shm_ptr_; T* mq_buffer_shm_ptr_; bi::managed_external_buffer::handle_t mq_handle_; + uint32_t size_; /// Create/load a Message queue. /// \param mq_shm Message queue representation in shared memory. @@ -283,6 +320,7 @@ class MessageQueue { mq_buffer_shm_ptr_ = mq_buffer_shm_.data_.get(); mq_shm_ptr_ = mq_shm_.data_.get(); mq_handle_ = mq_shm_.handle_; + size_ = mq_shm_ptr_->size; } }; }}} // namespace triton::backend::python diff --git a/src/metric.cc b/src/metric.cc new file mode 100644 index 00000000..4c055910 --- /dev/null +++ b/src/metric.cc @@ -0,0 +1,394 @@ +// Copyright 2023-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions +// are met: +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// * Neither the name of NVIDIA CORPORATION nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +#include "metric.h" + +#ifdef TRITON_PB_STUB +#include "pb_stub.h" +#endif + +namespace triton { namespace backend { namespace python { + +Metric::Metric( + const std::string& labels, std::optional> buckets, + void* metric_family_address) + : labels_(labels), buckets_(buckets), operation_value_(0), + metric_address_(nullptr), metric_family_address_(metric_family_address), + is_cleared_(false) +{ +#ifdef TRITON_PB_STUB + SendCreateMetricRequest(); +#endif +} + +Metric::~Metric() +{ +#ifdef TRITON_PB_STUB + Clear(); +#endif +} + +void +Metric::SaveToSharedMemory(std::unique_ptr& shm_pool) +{ + AllocatedSharedMemory custom_metric_shm = + shm_pool->Construct(); + custom_metric_shm_ptr_ = custom_metric_shm.data_.get(); + + std::unique_ptr labels_shm = PbString::Create(shm_pool, labels_); + + custom_metric_shm_ptr_->operation_value = operation_value_; + custom_metric_shm_ptr_->labels_shm_handle = labels_shm->ShmHandle(); + custom_metric_shm_ptr_->metric_family_address = metric_family_address_; + custom_metric_shm_ptr_->metric_address = metric_address_; + + // Histogram specific case + if (buckets_.has_value()) { + auto buckets_size = buckets_.value().size() * sizeof(double); + std::unique_ptr buckets_shm = PbMemory::Create( + shm_pool, TRITONSERVER_MemoryType::TRITONSERVER_MEMORY_CPU, 0, + buckets_size, reinterpret_cast(buckets_.value().data()), + false /* copy_gpu */); + custom_metric_shm_ptr_->buckets_shm_handle = buckets_shm->ShmHandle(); + buckets_shm_ = std::move(buckets_shm); + } else { + custom_metric_shm_ptr_->buckets_shm_handle = 0; + buckets_shm_ = nullptr; + } + + // Save the references to shared memory. + custom_metric_shm_ = std::move(custom_metric_shm); + labels_shm_ = std::move(labels_shm); + shm_handle_ = custom_metric_shm_.handle_; +} + +std::unique_ptr +Metric::LoadFromSharedMemory( + std::unique_ptr& shm_pool, + bi::managed_external_buffer::handle_t handle) +{ + AllocatedSharedMemory custom_metric_shm = + shm_pool->Load(handle); + MetricShm* custom_metric_shm_ptr = custom_metric_shm.data_.get(); + + std::unique_ptr labels_shm = PbString::LoadFromSharedMemory( + shm_pool, custom_metric_shm_ptr->labels_shm_handle); + + std::unique_ptr buckets_shm = nullptr; + if (custom_metric_shm_ptr->buckets_shm_handle != 0) { + buckets_shm = PbMemory::LoadFromSharedMemory( + shm_pool, custom_metric_shm_ptr->buckets_shm_handle, + false /* open_cuda_handle */); + } + + return std::unique_ptr( + new Metric(custom_metric_shm, labels_shm, buckets_shm)); +} + +Metric::Metric( + AllocatedSharedMemory& custom_metric_shm, + std::unique_ptr& labels_shm, + std::unique_ptr& buckets_shm) + : custom_metric_shm_(std::move(custom_metric_shm)), + labels_shm_(std::move(labels_shm)), buckets_shm_(std::move(buckets_shm)) +{ + custom_metric_shm_ptr_ = custom_metric_shm_.data_.get(); + + // FIXME: This constructor is called during each + // set/increment/observe/get_value call. It only needs the pointers. 
+ labels_ = labels_shm_->String(); + if (buckets_shm_ != nullptr) { // Histogram + size_t bucket_size = buckets_shm_->ByteSize() / sizeof(double); + std::vector buckets; + buckets.reserve(bucket_size); + for (size_t i = 0; i < bucket_size; ++i) { + buckets.emplace_back( + reinterpret_cast(buckets_shm_->DataPtr())[i]); + } + buckets_ = std::move(buckets); + } + + operation_value_ = custom_metric_shm_ptr_->operation_value; + metric_family_address_ = custom_metric_shm_ptr_->metric_family_address; + metric_address_ = custom_metric_shm_ptr_->metric_address; +} + +void* +Metric::MetricAddress() +{ + return metric_address_; +} + +#ifdef TRITON_PB_STUB +void +Metric::SendCreateMetricRequest() +{ + // Send the request to create the Metric to the parent process + std::unique_ptr& stub = Stub::GetOrCreateInstance(); + SaveToSharedMemory(stub->ShmPool()); + CustomMetricsMessage* custom_metrics_msg = nullptr; + AllocatedSharedMemory custom_metrics_shm; + try { + stub->SendMessage( + custom_metrics_shm, PYTHONSTUB_MetricRequestNew, shm_handle_); + } + catch (const PythonBackendException& pb_exception) { + throw PythonBackendException( + "Error when creating Metric: " + std::string(pb_exception.what())); + } + + custom_metrics_msg = custom_metrics_shm.data_.get(); + metric_address_ = custom_metrics_msg->address; +} + +void +Metric::SendIncrementRequest(const double& value) +{ + py::gil_scoped_release release; + try { + CheckIfCleared(); + std::unique_ptr& stub = Stub::GetOrCreateInstance(); + operation_value_ = value; + SaveToSharedMemory(stub->ShmPool()); + AllocatedSharedMemory custom_metrics_shm; + stub->SendMessage( + custom_metrics_shm, PYTHONSTUB_MetricRequestIncrement, shm_handle_); + } + catch (const PythonBackendException& pb_exception) { + throw PythonBackendException( + "Failed to increment metric value: " + + std::string(pb_exception.what())); + } +} + +void +Metric::SendSetValueRequest(const double& value) +{ + try { + CheckIfCleared(); + std::unique_ptr& stub = Stub::GetOrCreateInstance(); + operation_value_ = value; + SaveToSharedMemory(stub->ShmPool()); + AllocatedSharedMemory custom_metrics_shm; + stub->SendMessage( + custom_metrics_shm, PYTHONSTUB_MetricRequestSet, shm_handle_); + } + catch (const PythonBackendException& pb_exception) { + throw PythonBackendException( + "Failed to set metric value: " + std::string(pb_exception.what())); + } +} + +void +Metric::SendObserveRequest(const double& value) +{ + py::gil_scoped_release release; + try { + CheckIfCleared(); + std::unique_ptr& stub = Stub::GetOrCreateInstance(); + operation_value_ = value; + SaveToSharedMemory(stub->ShmPool()); + AllocatedSharedMemory custom_metrics_shm; + stub->SendMessage( + custom_metrics_shm, PYTHONSTUB_MetricRequestObserve, shm_handle_); + } + catch (const PythonBackendException& pb_exception) { + throw PythonBackendException( + "Failed to observe metric value: " + std::string(pb_exception.what())); + } +} + +double +Metric::SendGetValueRequest() +{ + CustomMetricsMessage* custom_metrics_msg = nullptr; + AllocatedSharedMemory custom_metrics_shm; + try { + CheckIfCleared(); + std::unique_ptr& stub = Stub::GetOrCreateInstance(); + SaveToSharedMemory(stub->ShmPool()); + stub->SendMessage( + custom_metrics_shm, PYTHONSTUB_MetricRequestValue, shm_handle_); + } + catch (const PythonBackendException& pb_exception) { + throw PythonBackendException( + "Failed to get metric value: " + std::string(pb_exception.what())); + } + + custom_metrics_msg = custom_metrics_shm.data_.get(); + return custom_metrics_msg->value; +} + 
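Each of the stub-side helpers above follows the same shape: serialize the Metric into shared memory, send a PYTHONSTUB_MetricRequest* command, and, for get_value, read the result back out of the returned CustomMetricsMessage. On the parent side those commands have to be routed to the C-API wrappers defined further down in this file; the fragment below is only a schematic of how that routing could look. HandleMetricRequest is a hypothetical name, the real dispatch lives elsewhere in the backend, the CustomMetricsMessage include location and the type of its address field are assumptions, and the code only applies to the non-stub build where InitializeTritonMetric and HandleMetricOperation exist.

```cpp
// Schematic parent-process dispatch for the metric IPC commands; names and
// routing here are illustrative, not the backend's actual handler.
#include <memory>

#include "ipc_message.h"  // PYTHONSTUB_CommandType values
#include "metric.h"
#include "pb_utils.h"     // assumed home of CustomMetricsMessage
#include "shm_manager.h"  // SharedMemoryManager

using namespace triton::backend::python;

void
HandleMetricRequest(
    std::unique_ptr<SharedMemoryManager>& shm_pool,
    bi::managed_external_buffer::handle_t metric_handle,
    CustomMetricsMessage* message, PYTHONSTUB_CommandType command)
{
  // Reconstruct the Metric object that the stub serialized before sending
  // the request.
  std::unique_ptr<Metric> metric =
      Metric::LoadFromSharedMemory(shm_pool, metric_handle);

  if (command == PYTHONSTUB_MetricRequestNew) {
    // Create the underlying TRITONSERVER_Metric and hand its address back to
    // the stub, which stores it as metric_address_. The address field type
    // is assumed to be void*.
    message->address = metric->InitializeTritonMetric();
  } else if (command == PYTHONSTUB_MetricRequestDelete) {
    metric->ClearTritonMetric();
  } else {
    // Value / Increment / Set / Observe are all folded into one helper.
    metric->HandleMetricOperation(message, command);
  }
}
```

The value of this split is that the stub process never touches the TRITONSERVER_Metric* directly; it only carries the opaque address returned by the parent and ships operation requests back over the same shared-memory channel.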
+void +Metric::Clear() +{ + // Need to check if the metric has been cleared before as the Clear()' + // function can be called from two different locations: when the metric family + // clears the 'metric_map_' and when the 'Metric' object goes out of + // scope/being deleted. + if (!is_cleared_) { + is_cleared_ = true; + std::unique_ptr& stub = Stub::GetOrCreateInstance(); + SaveToSharedMemory(stub->ShmPool()); + AllocatedSharedMemory custom_metrics_shm; + try { + stub->SendMessage( + custom_metrics_shm, PYTHONSTUB_MetricRequestDelete, shm_handle_); + } + catch (const PythonBackendException& pb_exception) { + std::cerr << "Error when deleting Metric: " << pb_exception.what() + << "\n"; + } + } +} + +void +Metric::CheckIfCleared() +{ + if (is_cleared_) { + throw PythonBackendException( + "Invalid metric operation as the corresponding 'MetricFamily' has been " + "deleted. The 'MetricFamily' object should be deleted AFTER its " + "corresponding 'Metric' objects have been deleted."); + } +} + +#else +void* +Metric::InitializeTritonMetric() +{ + std::vector labels_params; + ParseLabels(labels_params, labels_); + TRITONSERVER_MetricKind kind; + THROW_IF_TRITON_ERROR(TRITONSERVER_GetMetricFamilyKind( + reinterpret_cast(metric_family_address_), + &kind)); + TRITONSERVER_MetricArgs* args = nullptr; + switch (kind) { + case TRITONSERVER_METRIC_KIND_COUNTER: + case TRITONSERVER_METRIC_KIND_GAUGE: + break; + case TRITONSERVER_METRIC_KIND_HISTOGRAM: { + const std::vector& buckets = buckets_.value(); + THROW_IF_TRITON_ERROR(TRITONSERVER_MetricArgsNew(&args)); + THROW_IF_TRITON_ERROR(TRITONSERVER_MetricArgsSetHistogram( + args, buckets.data(), buckets.size())); + break; + } + default: + break; + } + + TRITONSERVER_Metric* triton_metric = nullptr; + THROW_IF_TRITON_ERROR(TRITONSERVER_MetricNewWithArgs( + &triton_metric, + reinterpret_cast(metric_family_address_), + labels_params.data(), labels_params.size(), args)); + for (const auto label : labels_params) { + TRITONSERVER_ParameterDelete(const_cast(label)); + } + THROW_IF_TRITON_ERROR(TRITONSERVER_MetricArgsDelete(args)); + return reinterpret_cast(triton_metric); +} + +void +Metric::ParseLabels( + std::vector& labels_params, + const std::string& labels) +{ + triton::common::TritonJson::Value labels_json; + THROW_IF_TRITON_ERROR(labels_json.Parse(labels)); + + std::vector members; + labels_json.Members(&members); + for (const auto& member : members) { + std::string value; + THROW_IF_TRITON_ERROR(labels_json.MemberAsString(member.c_str(), &value)); + labels_params.emplace_back(TRITONSERVER_ParameterNew( + member.c_str(), TRITONSERVER_PARAMETER_STRING, value.c_str())); + } +} + +void +Metric::HandleMetricOperation( + CustomMetricsMessage* metrics_message_ptr, + const PYTHONSTUB_CommandType& command_type) +{ + if (command_type == PYTHONSTUB_MetricRequestValue) { + metrics_message_ptr->value = GetValue(); + } else if (command_type == PYTHONSTUB_MetricRequestIncrement) { + Increment(operation_value_); + } else if (command_type == PYTHONSTUB_MetricRequestSet) { + SetValue(operation_value_); + } else if (command_type == PYTHONSTUB_MetricRequestObserve) { + Observe(operation_value_); + } else { + throw PythonBackendException("Unknown metric operation"); + } +} + +void +Metric::Increment(const double& value) +{ + auto triton_metric = reinterpret_cast(metric_address_); + THROW_IF_TRITON_ERROR(TRITONSERVER_MetricIncrement(triton_metric, value)); +} + +void +Metric::SetValue(const double& value) +{ + auto triton_metric = reinterpret_cast(metric_address_); + 
THROW_IF_TRITON_ERROR(TRITONSERVER_MetricSet(triton_metric, value)); +} + +void +Metric::Observe(const double& value) +{ + auto triton_metric = reinterpret_cast(metric_address_); + THROW_IF_TRITON_ERROR(TRITONSERVER_MetricObserve(triton_metric, value)); +} + +double +Metric::GetValue() +{ + double value; + auto triton_metric = reinterpret_cast(metric_address_); + THROW_IF_TRITON_ERROR(TRITONSERVER_MetricValue(triton_metric, &value)); + return value; +} + +void +Metric::ClearTritonMetric() +{ + auto triton_metric = reinterpret_cast(metric_address_); + if (triton_metric != nullptr) { + LOG_IF_ERROR(TRITONSERVER_MetricDelete(triton_metric), "deleting metric"); + } +} + +#endif + +}}} // namespace triton::backend::python diff --git a/src/metric.h b/src/metric.h new file mode 100644 index 00000000..cd54ca54 --- /dev/null +++ b/src/metric.h @@ -0,0 +1,193 @@ +// Copyright 2023-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions +// are met: +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// * Neither the name of NVIDIA CORPORATION nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +#pragma once + +#include +#include + +#include "ipc_message.h" +#include "pb_memory.h" +#include "pb_string.h" +#include "pb_utils.h" + +#ifdef TRITON_PB_STUB +#include +namespace py = pybind11; +#else +#include "triton/core/tritonserver.h" +#endif + +namespace triton { namespace backend { namespace python { + +// The 'MetricShm' struct is utilized by the 'Metric' class for saving the +// essential data to shared memory and for loading the data from shared memory +// in order to reconstruct the 'Metric' object. +struct MetricShm { + // The shared memory handle of the labels in PbString format. + bi::managed_external_buffer::handle_t labels_shm_handle; + // The shared memory handle of the buckets in PbMemory format. + bi::managed_external_buffer::handle_t buckets_shm_handle; + // The value used for incrementing or setting the metric. + double operation_value; + // The address of the TRITONSERVER_Metric object. + void* metric_address; + // The address corresponds to the TRITONSERVER_MetricFamily object that this + // metric belongs to. 
+ void* metric_family_address; +}; + +class Metric { + public: + Metric( + const std::string& labels, + std::optional> buckets, + void* metric_family_address); + + ~Metric(); + + /// Save Custom Metric object to shared memory. + /// \param shm_pool Shared memory pool to save the custom metric object. + void SaveToSharedMemory(std::unique_ptr& shm_pool); + + /// Create a Custom Metric object from shared memory. + /// \param shm_pool Shared memory pool + /// \param handle Shared memory handle of the custom metric. + /// \return Returns the custom metrics in the specified request_handle + /// location. + static std::unique_ptr LoadFromSharedMemory( + std::unique_ptr& shm_pool, + bi::managed_external_buffer::handle_t handle); + + /// Get the address of the TRITONSERVER_Metric object. + /// \return Returns the address of the TRITONSERVER_Metric object. + void* MetricAddress(); + + /// Send the request to the parent process to delete the Metric object. + void Clear(); + +#ifdef TRITON_PB_STUB + /// Send a request to register a new 'TRITONSERVER_Metric' object to the + /// parent process. + void SendCreateMetricRequest(); + + /// Send the request to the parent process to increment the metric by the + /// specified value. + /// \param value The value to increment the metric by. + void SendIncrementRequest(const double& value); + + /// Send the request to the parent process to set the metric to the specified + /// value. + /// \param value The value to set the metric to. + void SendSetValueRequest(const double& value); + + /// Send the request to the parent process to observe the value to the metric. + /// \param value The value to set the metric to. + void SendObserveRequest(const double& value); + + /// Send the request to the parent process to get the value of the metric. + /// \return Returns the value of the metric. + double SendGetValueRequest(); + + /// Throws an exception if the metric has been cleared. This check is to avoid + /// the user error where the corresponding metric family has been deleted + /// before the metric is deleted. + void CheckIfCleared(); +#else + // Initialize the TRITONSERVER_Metric object. + /// \return Returns the address of the TRITONSERVER_Metric object. + void* InitializeTritonMetric(); + + /// Parse the labels string into a vector of TRITONSERVER_Parameter. + /// \param labels_params The vector of TRITONSERVER_Parameter to store the + /// parsed labels. + /// \param labels The labels string to parse. + void ParseLabels( + std::vector& labels_params, + const std::string& labels); + + /// Handle the metric operation. + /// \param metrics_message_ptr The pointer to the CustomMetricsMessage object. + void HandleMetricOperation( + CustomMetricsMessage* metrics_message_ptr, + const PYTHONSTUB_CommandType& command_type); + + /// Use Triton C API to increment the value of the metric by the given value. + /// \param value The value to increment the metric by. + void Increment(const double& value); + + /// Use Triton C API to set the value of the metric to the given value. + /// \param value The value to set the metric to. + void SetValue(const double& value); + + /// Use Triton C API to sample the observation to the metric. + /// \param value The value to sample observation to the metric. + void Observe(const double& value); + + /// Use Triton C API to get the value of the metric. + double GetValue(); + + /// Clear the TRITONSERVER_Metric object. + void ClearTritonMetric(); +#endif + + /// Disallow copying the custom metric object. 
+  DISALLOW_COPY_AND_ASSIGN(Metric);
+
+ private:
+  // The private constructor for creating a Metric object from shared memory.
+  Metric(
+      AllocatedSharedMemory<MetricShm>& custom_metric_shm,
+      std::unique_ptr<PbString>& labels_shm,
+      std::unique_ptr<PbMemory>& buckets);
+
+  // The labels of the metric, which is the identifier of the metric.
+  std::string labels_;
+  // Monotonically increasing values representing bucket boundaries for
+  // creating a histogram metric.
+  std::optional<std::vector<double>> buckets_;
+  // The value used for incrementing or setting the metric.
+  double operation_value_;
+  // The address of the TRITONSERVER_Metric object.
+  void* metric_address_;
+  // The address of the TRITONSERVER_MetricFamily object that this metric
+  // belongs to.
+  void* metric_family_address_;
+  // Indicates whether the metric has been cleared. It is needed as the
+  // 'Clear()' function can be called from two different locations: when the
+  // metric family clears the 'metric_map_' and when the 'Metric' object goes
+  // out of scope/is deleted.
+  bool is_cleared_;
+
+  // Shared Memory Data Structures
+  AllocatedSharedMemory<MetricShm> custom_metric_shm_;
+  MetricShm* custom_metric_shm_ptr_;
+  bi::managed_external_buffer::handle_t shm_handle_;
+  std::unique_ptr<PbString> labels_shm_;
+  std::unique_ptr<PbMemory> buckets_shm_;
+};
+
+}}};  // namespace triton::backend::python
diff --git a/src/metric_family.cc b/src/metric_family.cc
new file mode 100644
index 00000000..222a0e23
--- /dev/null
+++ b/src/metric_family.cc
@@ -0,0 +1,248 @@
+// Copyright 2023-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions
+// are met:
+//  * Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+//  * Redistributions in binary form must reproduce the above copyright
+//    notice, this list of conditions and the following disclaimer in the
+//    documentation and/or other materials provided with the distribution.
+//  * Neither the name of NVIDIA CORPORATION nor the names of its
+//    contributors may be used to endorse or promote products derived
+//    from this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
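The `buckets_` member and `SendObserveRequest()` above add histogram support to the same metrics API. A sketch under the same assumptions as the earlier example (`buckets` and `observe` are the documented Python-side names; they do not appear in this diff):

    import triton_python_backend_utils as pb_utils

    # Bucket boundaries must be monotonically increasing, matching buckets_.
    latency_family = pb_utils.MetricFamily(
        name="request_latency_seconds",
        description="Request latency histogram",
        kind=pb_utils.MetricFamily.HISTOGRAM)
    latency = latency_family.Metric(
        labels={"model": "example"}, buckets=[0.1, 0.5, 1.0, 2.5, 10.0])

    latency.observe(0.42)  # routed through SendObserveRequest()
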
+ +#include "metric_family.h" + +#ifdef TRITON_PB_STUB +#include "pb_stub.h" +#endif + +namespace triton { namespace backend { namespace python { + +MetricFamily::MetricFamily( + const std::string& name, const std::string& description, + const MetricKind& kind) + : name_(name), description_(description), kind_(kind), + metric_family_address_(nullptr) +{ +#ifdef TRITON_PB_STUB + SendCreateMetricFamilyRequest(); +#endif +} + +MetricFamily::~MetricFamily() +{ +#ifdef TRITON_PB_STUB + // Clear all the metrics first + { + std::lock_guard lock(metric_map_mu_); + for (auto& m : metric_map_) { + m.second->Clear(); + } + } + + // Send the request to delete the MetricFamily to the parent process + std::unique_ptr& stub = Stub::GetOrCreateInstance(); + SaveToSharedMemory(stub->ShmPool()); + AllocatedSharedMemory custom_metrics_shm; + try { + stub->SendMessage( + custom_metrics_shm, PYTHONSTUB_MetricFamilyRequestDelete, shm_handle_); + } + catch (const PythonBackendException& pb_exception) { + std::cerr << "Error when deleting MetricFamily: " << pb_exception.what() + << "\n"; + } +#endif +}; + +void +MetricFamily::SaveToSharedMemory(std::unique_ptr& shm_pool) +{ + AllocatedSharedMemory custom_metric_family_shm = + shm_pool->Construct(); + + custom_metric_family_shm_ptr_ = custom_metric_family_shm.data_.get(); + std::unique_ptr name_shm = PbString::Create(shm_pool, name_); + std::unique_ptr description_shm = + PbString::Create(shm_pool, description_); + + custom_metric_family_shm_ptr_->kind = kind_; + custom_metric_family_shm_ptr_->name_shm_handle = name_shm->ShmHandle(); + custom_metric_family_shm_ptr_->description_shm_handle = + description_shm->ShmHandle(); + custom_metric_family_shm_ptr_->metric_family_address = metric_family_address_; + + // Save the references to shared memory. 
+ custom_metric_family_shm_ = std::move(custom_metric_family_shm); + name_shm_ = std::move(name_shm); + description_shm_ = std::move(description_shm); + shm_handle_ = custom_metric_family_shm_.handle_; +} + +std::unique_ptr +MetricFamily::LoadFromSharedMemory( + std::unique_ptr& shm_pool, + bi::managed_external_buffer::handle_t handle) +{ + AllocatedSharedMemory custom_metric_family_shm = + shm_pool->Load(handle); + MetricFamilyShm* custom_metric_family_shm_ptr = + custom_metric_family_shm.data_.get(); + std::unique_ptr name_shm = PbString::LoadFromSharedMemory( + shm_pool, custom_metric_family_shm_ptr->name_shm_handle); + std::unique_ptr description_shm = PbString::LoadFromSharedMemory( + shm_pool, custom_metric_family_shm_ptr->description_shm_handle); + + return std::unique_ptr( + new MetricFamily(custom_metric_family_shm, name_shm, description_shm)); +} + +MetricFamily::MetricFamily( + AllocatedSharedMemory& custom_metric_family_shm, + std::unique_ptr& name_shm, + std::unique_ptr& description_shm) + : custom_metric_family_shm_(std::move(custom_metric_family_shm)), + name_shm_(std::move(name_shm)), + description_shm_(std::move(description_shm)) +{ + custom_metric_family_shm_ptr_ = custom_metric_family_shm_.data_.get(); + name_ = name_shm_->String(); + description_ = description_shm_->String(); + kind_ = custom_metric_family_shm_ptr_->kind; + metric_family_address_ = custom_metric_family_shm_ptr_->metric_family_address; +} + +void* +MetricFamily::MetricFamilyAddress() +{ + return metric_family_address_; +} + +#ifdef TRITON_PB_STUB +std::shared_ptr +MetricFamily::CreateMetricFamily( + const std::string& name, const std::string& description, + const MetricKind& kind) +{ + std::shared_ptr metric_family = + std::make_shared(name, description, kind); + metric_family->SendCreateMetricFamilyRequest(); + return metric_family; +} + +void +MetricFamily::SendCreateMetricFamilyRequest() +{ + std::unique_ptr& stub = Stub::GetOrCreateInstance(); + SaveToSharedMemory(stub->ShmPool()); + CustomMetricsMessage* custom_metrics_msg = nullptr; + AllocatedSharedMemory custom_metrics_shm; + try { + stub->SendMessage( + custom_metrics_shm, PYTHONSTUB_MetricFamilyRequestNew, shm_handle_); + } + catch (const PythonBackendException& pb_exception) { + throw PythonBackendException( + "Error when creating MetricFamily: " + + std::string(pb_exception.what())); + } + + custom_metrics_msg = custom_metrics_shm.data_.get(); + metric_family_address_ = custom_metrics_msg->address; +} + +std::shared_ptr +MetricFamily::CreateMetric(const py::object& labels, const py::object& buckets) +{ + if (!labels.is_none()) { + if (!py::isinstance(labels)) { + throw PythonBackendException( + "Failed to create metric. Labels must be a dictionary."); + } + } + + py::module json = py::module_::import("json"); + std::string labels_str = std::string(py::str(json.attr("dumps")(labels))); + + std::optional> buckets_vec; + if (!buckets.is_none()) { + if (!py::isinstance(buckets)) { + throw PythonBackendException( + "Failed to create metric. Buckets must be a list."); + } + if (kind_ == kCounter || kind_ == kGauge) { + throw PythonBackendException( + "Failed to create metric. Unexpected buckets found."); + } + buckets_vec = buckets.cast>(); + } else { + if (kind_ == kHistogram) { + throw PythonBackendException( + "Failed to create metric. 
Missing required buckets."); + } + buckets_vec = std::nullopt; + } + + auto metric = + std::make_shared(labels_str, buckets_vec, metric_family_address_); + { + std::lock_guard lock(metric_map_mu_); + metric_map_.insert({metric->MetricAddress(), metric}); + } + + return metric; +} +#else +void* +MetricFamily::InitializeTritonMetricFamily() +{ + TRITONSERVER_MetricKind triton_kind = ToTritonServerMetricKind(kind_); + TRITONSERVER_MetricFamily* triton_metric_family = nullptr; + THROW_IF_TRITON_ERROR(TRITONSERVER_MetricFamilyNew( + &triton_metric_family, triton_kind, name_.c_str(), description_.c_str())); + return reinterpret_cast(triton_metric_family); +} + +TRITONSERVER_MetricKind +MetricFamily::ToTritonServerMetricKind(const MetricKind& kind) +{ + switch (kind) { + case kCounter: + return TRITONSERVER_METRIC_KIND_COUNTER; + case kGauge: + return TRITONSERVER_METRIC_KIND_GAUGE; + case kHistogram: + return TRITONSERVER_METRIC_KIND_HISTOGRAM; + default: + throw PythonBackendException("Unknown metric kind"); + } +} + +void +MetricFamily::ClearTritonMetricFamily() +{ + auto metric_family = + reinterpret_cast(metric_family_address_); + if (metric_family != nullptr) { + LOG_IF_ERROR( + TRITONSERVER_MetricFamilyDelete(metric_family), + "deleting metric family"); + } +} +#endif + +}}} // namespace triton::backend::python diff --git a/src/metric_family.h b/src/metric_family.h new file mode 100644 index 00000000..2b5f86ab --- /dev/null +++ b/src/metric_family.h @@ -0,0 +1,154 @@ +// Copyright 2023-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions +// are met: +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// * Neither the name of NVIDIA CORPORATION nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
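The `~MetricFamily()` destructor above clears every `Metric` it created before deleting the family, and `Metric::CheckIfCleared()` turns any later use of a cleared metric into an error. A sketch of the intended ownership pattern in a `model.py`, under the same assumed Python API as the earlier examples:

    import triton_python_backend_utils as pb_utils

    class TritonPythonModel:
        def initialize(self, args):
            # Keep the family referenced at least as long as its metrics; if
            # the family is released first, later set()/value() calls raise.
            self.family = pb_utils.MetricFamily(
                name="inflight_requests",
                description="Requests currently being processed",
                kind=pb_utils.MetricFamily.GAUGE)
            self.inflight = self.family.Metric(labels={"model": "example"})

        def execute(self, requests):
            self.inflight.set(len(requests))
            ...

        def finalize(self):
            # Release the metric before (or together with) its family.
            self.inflight = None
            self.family = None
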
+ +#pragma once + +#include + +#include "ipc_message.h" +#include "metric.h" +#include "pb_string.h" +#include "pb_utils.h" + +#ifdef TRITON_PB_STUB +#include +namespace py = pybind11; +#else +#include "triton/core/tritonserver.h" +#endif + +namespace triton { namespace backend { namespace python { + +// The 'MetricFamilyShm' struct is utilized by the 'MetricFamily' class for +// saving the essential data to shared memory and for loading the data from +// shared memory in order to reconstruct the 'MetricFamily' object. +struct MetricFamilyShm { + // The shared memory handle of the name in PbString format. + bi::managed_external_buffer::handle_t name_shm_handle; + // The shared memory handle of the description in PbString format. + bi::managed_external_buffer::handle_t description_shm_handle; + // The metric kind of the 'MetricFamily'. + MetricKind kind; + // The address of the 'TRITONSERVER_MetricFamily' object. + void* metric_family_address; +}; + +class MetricFamily { + public: + MetricFamily( + const std::string& name, const std::string& description, + const MetricKind& kind); + + ~MetricFamily(); + + /// Save a custom metric family to shared memory. + /// \param shm_pool Shared memory pool to save the custom metric family. + void SaveToSharedMemory(std::unique_ptr& shm_pool); + + /// Create a Custom Metric Family object from shared memory. + /// \param shm_pool Shared memory pool + /// \param handle Shared memory handle of the custom metric family. + /// \return Returns the custom metric family in the specified handle + /// location. + static std::unique_ptr LoadFromSharedMemory( + std::unique_ptr& shm_pool, + bi::managed_external_buffer::handle_t handle); + + /// Get the address of the TRITONSERVER_MetricFamily object. + /// \return Returns the address of the TRITONSERVER_MetricFamily object. + void* MetricFamilyAddress(); + +#ifdef TRITON_PB_STUB + /// Create a metric family object and returned as a shared pointer. + /// \param name The name of the metric family. + /// \param description The description of the metric family. + /// \param kind The metric kind of the metric family. + /// \return Returns the shared pointer to the created metric family. + static std::shared_ptr CreateMetricFamily( + const std::string& name, const std::string& description, + const MetricKind& kind); + + /// Send a request to register a new 'TRITONSERVER_MetricFamily' object to the + /// parent process. + void SendCreateMetricFamilyRequest(); + + /// Create a metric from the metric family and store it in the metric map. + /// \param labels The labels of the metric. + /// \param buckets Monotonically increasing values representing bucket + /// boundaries for creating histogram metric. + /// \return Returns the shared pointer to the created metric. + std::shared_ptr CreateMetric( + const py::object& labels, const py::object& buckets); +#else + /// Initialize the TRITONSERVER_MetricFamily object. + /// \return Returns the address of the TRITONSERVER_MetricFamily object. + void* InitializeTritonMetricFamily(); + + /// Helper function to convert the MetricKind enum to TRITONSERVER_MetricKind + /// \param kind The MetricKind enum to be converted. + /// \return Returns the TRITONSERVER_MetricKind enum. + TRITONSERVER_MetricKind ToTritonServerMetricKind(const MetricKind& kind); + + /// Clear the TRITONSERVER_MetricFamily object. + void ClearTritonMetricFamily(); +#endif + + /// Disallow copying the metric family object. 
+ DISALLOW_COPY_AND_ASSIGN(MetricFamily); + + private: + // The private constructor for creating a MetricFamily object from shared + // memory. + MetricFamily( + AllocatedSharedMemory& custom_metric_family_shm, + std::unique_ptr& name_shm, + std::unique_ptr& description_shm); + + // The name of the metric family. + std::string name_; + // The description of the metric family. + std::string description_; + // The metric kind of the metric family. Currently only supports GAUGE, + // COUNTER and HISTOGRAM. + MetricKind kind_; + // The address of the TRITONSERVER_MetricFamily object. + void* metric_family_address_; + + // The mutex to protect the 'metric_map_'. + std::mutex metric_map_mu_; + // Need to keep track of the metrics associated with the metric family to make + // sure the metrics are cleaned up before the metric family is deleted. + std::unordered_map> metric_map_; + + // Shared Memory Data Structures + AllocatedSharedMemory custom_metric_family_shm_; + MetricFamilyShm* custom_metric_family_shm_ptr_; + bi::managed_external_buffer::handle_t shm_handle_; + std::unique_ptr name_shm_; + std::unique_ptr description_shm_; +}; + +}}}; // namespace triton::backend::python diff --git a/src/model_loader.cc b/src/model_loader.cc new file mode 100644 index 00000000..0be45fa5 --- /dev/null +++ b/src/model_loader.cc @@ -0,0 +1,267 @@ +// Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions +// are met: +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// * Neither the name of NVIDIA CORPORATION nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
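The `model_loader.cc` file that follows implements both halves (stub and parent) of the model-management helpers. A usage sketch assuming the documented `pb_utils` functions; the keyword-argument names are taken from that documentation, not from this diff:

    import triton_python_backend_utils as pb_utils

    # Load a model (optionally overriding its config or supplying file
    # contents), check readiness for a specific version, then unload it
    # together with any dependent models.
    pb_utils.load_model(model_name="onnx_model")
    if pb_utils.is_model_ready(model_name="onnx_model", model_version="1"):
        pb_utils.unload_model(model_name="onnx_model", unload_dependents=True)
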
+#include "model_loader.h" + +#ifdef TRITON_PB_STUB +#include "pb_stub.h" +#endif + +namespace triton { namespace backend { namespace python { + +void +ModelLoader::SaveToSharedMemory(std::unique_ptr& shm_pool) +{ + AllocatedSharedMemory model_loader_req_shm = + shm_pool->Construct(); + model_loader_req_shm_ptr_ = model_loader_req_shm.data_.get(); + + std::unique_ptr name_shm = PbString::Create(shm_pool, name_); + std::unique_ptr version_shm = PbString::Create(shm_pool, version_); + std::unique_ptr config_shm = PbString::Create(shm_pool, config_); + std::unique_ptr files_shm = PbMap::Create(shm_pool, files_); + + model_loader_req_shm_ptr_->name_shm_handle = name_shm->ShmHandle(); + model_loader_req_shm_ptr_->version_shm_handle = version_shm->ShmHandle(); + model_loader_req_shm_ptr_->config_shm_handle = config_shm->ShmHandle(); + model_loader_req_shm_ptr_->files_shm_handle = files_shm->ShmHandle(); + model_loader_req_shm_ptr_->unload_dependents = unload_dependents_; + + // Save the references to shared memory. + model_loader_req_shm_ = std::move(model_loader_req_shm); + name_shm_ = std::move(name_shm); + version_shm_ = std::move(version_shm); + config_shm_ = std::move(config_shm); + files_shm_ = std::move(files_shm); + + shm_handle_ = model_loader_req_shm_.handle_; +} + +std::unique_ptr +ModelLoader::LoadFromSharedMemory( + std::unique_ptr& shm_pool, + bi::managed_external_buffer::handle_t handle) +{ + AllocatedSharedMemory model_loader_req_shm = + shm_pool->Load(handle); + ModelLoaderRequestShm* model_loader_req_shm_ptr = + model_loader_req_shm.data_.get(); + + std::unique_ptr name_shm = PbString::LoadFromSharedMemory( + shm_pool, model_loader_req_shm_ptr->name_shm_handle); + std::unique_ptr version_shm = PbString::LoadFromSharedMemory( + shm_pool, model_loader_req_shm_ptr->version_shm_handle); + std::unique_ptr config_shm = PbString::LoadFromSharedMemory( + shm_pool, model_loader_req_shm_ptr->config_shm_handle); + std::unique_ptr files_shm = PbMap::LoadFromSharedMemory( + shm_pool, model_loader_req_shm_ptr->files_shm_handle); + + return std::unique_ptr(new ModelLoader( + model_loader_req_shm, name_shm, version_shm, config_shm, files_shm)); +} + +ModelLoader::ModelLoader( + AllocatedSharedMemory& model_loader_req_shm, + std::unique_ptr& name_shm, std::unique_ptr& version_shm, + std::unique_ptr& config_shm, std::unique_ptr& files_shm) + : model_loader_req_shm_(std::move(model_loader_req_shm)), + name_shm_(std::move(name_shm)), version_shm_(std::move(version_shm)), + config_shm_(std::move(config_shm)), files_shm_(std::move(files_shm)) +{ + model_loader_req_shm_ptr_ = model_loader_req_shm_.data_.get(); + name_ = name_shm_->String(); + version_ = version_shm_->String(); + config_ = config_shm_->String(); + files_ = files_shm_->UnorderedMap(); + unload_dependents_ = model_loader_req_shm_ptr_->unload_dependents; +} +#ifdef TRITON_PB_STUB +void +ModelLoader::SendLoadModelRequest() +{ + std::unique_ptr& stub = Stub::GetOrCreateInstance(); + SaveToSharedMemory(stub->ShmPool()); + AllocatedSharedMemory model_loader_msg_shm; + + try { + stub->SendMessage( + model_loader_msg_shm, PYTHONSTUB_LoadModelRequest, shm_handle_); + } + catch (const PythonBackendException& pb_exception) { + throw PythonBackendException( + "Failed to load model: " + std::string(pb_exception.what())); + } +} + +void +ModelLoader::SendUnloadModelRequest() +{ + std::unique_ptr& stub = Stub::GetOrCreateInstance(); + SaveToSharedMemory(stub->ShmPool()); + AllocatedSharedMemory model_loader_msg_shm; + try { + stub->SendMessage( + 
model_loader_msg_shm, PYTHONSTUB_UnloadModelRequest, shm_handle_); + } + catch (const PythonBackendException& pb_exception) { + throw PythonBackendException( + "Failed to unload model: " + std::string(pb_exception.what())); + } +} + +bool +ModelLoader::SendModelReadinessRequest() +{ + std::unique_ptr& stub = Stub::GetOrCreateInstance(); + SaveToSharedMemory(stub->ShmPool()); + ModelLoaderMessage* model_loader_msg = nullptr; + AllocatedSharedMemory model_loader_msg_shm; + try { + stub->SendMessage( + model_loader_msg_shm, PYTHONSTUB_ModelReadinessRequest, shm_handle_); + } + catch (const PythonBackendException& pb_exception) { + throw PythonBackendException( + "Failed to check model readiness: " + std::string(pb_exception.what())); + } + + model_loader_msg = model_loader_msg_shm.data_.get(); + return model_loader_msg->is_model_ready; +} + +void +LoadModel( + const std::string& name, const std::string& config, const py::object& files) +{ + std::unordered_map files_map; + + if (!files.is_none()) { + if (!py::isinstance(files)) { + throw PythonBackendException( + "failed to load model '" + name + + "', files should be a dictionary of file paths and file contents"); + } + + py::dict files_dict = py::cast(files); + for (const auto& item : files_dict) { + std::string key = py::cast(item.first); + py::bytes value = py::cast(item.second); + std::string content(value); + files_map[key] = content; + } + } + + ModelLoader model_loader(name, config, files_map); + model_loader.SendLoadModelRequest(); +} + +void +UnloadModel(const std::string& name, const bool unload_dependents) +{ + ModelLoader model_loader(name, unload_dependents); + model_loader.SendUnloadModelRequest(); +} + +bool +IsModelReady(const std::string& name, const std::string& version) +{ + ModelLoader model_loader(name, version); + return model_loader.SendModelReadinessRequest(); +} +#else +void +ModelLoader::LoadModel(TRITONSERVER_Server* server) +{ + std::string path = ""; + std::string file_content = ""; + std::vector const_params; + if (!config_.empty()) { + const_params.emplace_back(TRITONSERVER_ParameterNew( + "config", TRITONSERVER_PARAMETER_STRING, config_.c_str())); + } + if (!files_.empty()) { + for (auto& file : files_) { + path = file.first; + file_content = file.second; + const_params.emplace_back(TRITONSERVER_ParameterBytesNew( + path.c_str(), file_content.data(), file_content.size())); + } + } + + THROW_IF_TRITON_ERROR(TRITONSERVER_ServerLoadModelWithParameters( + server, name_.c_str(), const_params.data(), const_params.size())); + + for (const auto param : const_params) { + TRITONSERVER_ParameterDelete(const_cast(param)); + } +} + +void +ModelLoader::UnloadModel(TRITONSERVER_Server* server) +{ + if (unload_dependents_) { + THROW_IF_TRITON_ERROR( + TRITONSERVER_ServerUnloadModelAndDependents(server, name_.c_str())); + } else { + THROW_IF_TRITON_ERROR( + TRITONSERVER_ServerUnloadModel(server, name_.c_str())); + } +} + +bool +ModelLoader::IsModelReady(TRITONSERVER_Server* server) +{ + bool is_ready = false; + int64_t model_version = GetModelVersionFromString(version_); + THROW_IF_TRITON_ERROR(TRITONSERVER_ServerModelIsReady( + server, name_.c_str(), model_version, &is_ready)); + return is_ready; +} + +int64_t +ModelLoader::GetModelVersionFromString(const std::string& version_string) +{ + int64_t version = -1; + if (!version_string.empty()) { + try { + version = std::stol(version_string); + } + catch (std::exception& e) { + throw PythonBackendException( + "failed to get model version from specified version string '" + + 
version_string + "' (details: " + e.what() + + "), version should be an integral value > 0"); + } + + if (version < 0) { + throw PythonBackendException( + "failed to get model version from specified version string '" + + version_string + "', version should be an integral value > 0"); + } + } + return version; +} +#endif +}}} // namespace triton::backend::python diff --git a/src/model_loader.h b/src/model_loader.h new file mode 100644 index 00000000..e4fe9fd6 --- /dev/null +++ b/src/model_loader.h @@ -0,0 +1,165 @@ +// Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions +// are met: +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// * Neither the name of NVIDIA CORPORATION nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +#pragma once + +#include +#include + +#include "ipc_message.h" +#include "pb_map.h" +#include "pb_string.h" +#include "pb_utils.h" + +#ifdef TRITON_PB_STUB +#include +namespace py = pybind11; +#else +#include "triton/core/tritonserver.h" +#endif + +namespace triton { namespace backend { namespace python { + +// The 'ModelLoaderRequestShm' struct is utilized by the 'ModelLoader' class for +// saving the essential data to shared memory and for loading the data from +// shared memory in order to reconstruct the 'ModelLoader' object. +struct ModelLoaderRequestShm { + // The shared memory handle of the model name in PbString format. + bi::managed_external_buffer::handle_t name_shm_handle; + // The shared memory handle of the model version in PbString format. + bi::managed_external_buffer::handle_t version_shm_handle; + // The flag to unload the dependent models. + bool unload_dependents; + // The shared memory handle of the config in PbString format. + bi::managed_external_buffer::handle_t config_shm_handle; + // The shared memory handle of the files in PbMap format. 
+ bi::managed_external_buffer::handle_t files_shm_handle; +}; + +class ModelLoader { + public: + ModelLoader( + const std::string& name, const std::string& config, + const std::unordered_map& files) + : name_(name), version_(""), config_(config), files_(files), + unload_dependents_(false) + { + } + + ModelLoader(const std::string& name, const bool unload_dependents) + : name_(name), version_(""), config_(""), files_({}), + unload_dependents_(unload_dependents) + { + } + + ModelLoader(const std::string& name, const std::string& version) + : name_(name), version_(version), config_(""), files_({}), + unload_dependents_(false) + { + } + + /// Save ModelLoader object to shared memory. + /// \param shm_pool Shared memory pool to save the ModelLoader object. + void SaveToSharedMemory(std::unique_ptr& shm_pool); + + /// Create a ModelLoader object from shared memory. + /// \param shm_pool Shared memory pool + /// \param handle Shared memory handle of the ModelLoader. + /// \return Returns the ModelLoaders in the specified request_handle + /// location. + static std::unique_ptr LoadFromSharedMemory( + std::unique_ptr& shm_pool, + bi::managed_external_buffer::handle_t handle); +#ifdef TRITON_PB_STUB + /// Send a request to load the model. + void SendLoadModelRequest(); + + /// Send a request to unload the model. + void SendUnloadModelRequest(); + + /// Send a request to check if the model is ready. + bool SendModelReadinessRequest(); +#else + /// Use Triton C API to load the model. + /// \param server The Triton server object. + void LoadModel(TRITONSERVER_Server* server); + + /// Use Triton C API to unload the model. + /// \param server The Triton server object. + void UnloadModel(TRITONSERVER_Server* server); + + /// Use Triton C API to check if the model is ready. + /// \param server The Triton server object. + /// \return Returns true if the model is ready. + bool IsModelReady(TRITONSERVER_Server* server); + + /// Get the model version from the version string. + /// \param version_string The version string. + /// \return Returns the model version in uint64_t. + int64_t GetModelVersionFromString(const std::string& version_string); +#endif + /// Disallow copying the ModelLoader object. + DISALLOW_COPY_AND_ASSIGN(ModelLoader); + + private: + // The private constructor for creating a Metric object from shared memory. + ModelLoader( + AllocatedSharedMemory& model_loader_req_shm, + std::unique_ptr& name_shm, + std::unique_ptr& version_shm, + std::unique_ptr& config_shm, std::unique_ptr& files_shm); + + // The name of the model. + std::string name_; + // The version of the model. + std::string version_; + // The configuration of the model. + std::string config_; + // The files of the model. + std::unordered_map files_; + // The flag to unload the dependent models. + bool unload_dependents_; + + // // Shared Memory Data Structures + AllocatedSharedMemory model_loader_req_shm_; + ModelLoaderRequestShm* model_loader_req_shm_ptr_; + bi::managed_external_buffer::handle_t shm_handle_; + std::unique_ptr name_shm_; + std::unique_ptr version_shm_; + std::unique_ptr config_shm_; + std::unique_ptr files_shm_; +}; + +#ifdef TRITON_PB_STUB +// The binding functions for the Python stub. 
+void LoadModel( + const std::string& name, const std::string& config, + const py::object& files = py::none()); +void UnloadModel(const std::string& name, const bool unload_dependents); +bool IsModelReady(const std::string& name, const std::string& version); +#endif + +}}}; // namespace triton::backend::python diff --git a/src/pb_bls_cancel.cc b/src/pb_bls_cancel.cc new file mode 100644 index 00000000..4341c037 --- /dev/null +++ b/src/pb_bls_cancel.cc @@ -0,0 +1,93 @@ +// Copyright 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions +// are met: +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// * Neither the name of NVIDIA CORPORATION nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +#include "pb_bls_cancel.h" + +#include "pb_stub.h" +#include "pb_stub_log.h" + +namespace triton { namespace backend { namespace python { + +void +PbBLSCancel::SaveToSharedMemory(std::unique_ptr& shm_pool) +{ + cancel_shm_ = shm_pool->Construct(); + new (&(cancel_shm_.data_->mu)) bi::interprocess_mutex; + new (&(cancel_shm_.data_->cv)) bi::interprocess_condition; + cancel_shm_.data_->waiting_on_stub = false; + cancel_shm_.data_->infer_payload_id = infer_playload_id_; + cancel_shm_.data_->is_cancelled = is_cancelled_; +} + +bi::managed_external_buffer::handle_t +PbBLSCancel::ShmHandle() +{ + return cancel_shm_.handle_; +} + +CancelBLSRequestMessage* +PbBLSCancel::ShmPayload() +{ + return cancel_shm_.data_.get(); +} + +void +PbBLSCancel::Cancel() +{ + // Release the GIL. Python objects are not accessed during the check. + py::gil_scoped_release gil_release; + + std::unique_lock lk(mu_); + // The cancelled flag can only move from false to true, not the other way, so + // it is checked on each query until cancelled and then implicitly cached. 
+ if (is_cancelled_) { + return; + } + if (!updating_) { + std::unique_ptr& stub = Stub::GetOrCreateInstance(); + if (!stub->StubToParentServiceActive()) { + LOG_ERROR << "Cannot communicate with parent service"; + return; + } + + stub->EnqueueCancelBLSRequest(this); + updating_ = true; + } + cv_.wait(lk, [this] { return !updating_; }); +} + +void +PbBLSCancel::ReportIsCancelled(bool is_cancelled) +{ + { + std::lock_guard lk(mu_); + is_cancelled_ = is_cancelled; + updating_ = false; + } + cv_.notify_all(); +} + +}}} // namespace triton::backend::python diff --git a/src/pb_bls_cancel.h b/src/pb_bls_cancel.h new file mode 100644 index 00000000..7fdd3fbf --- /dev/null +++ b/src/pb_bls_cancel.h @@ -0,0 +1,63 @@ +// Copyright 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions +// are met: +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// * Neither the name of NVIDIA CORPORATION nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +#pragma once + +#include +#include + +#include "pb_utils.h" + +namespace triton { namespace backend { namespace python { + +class PbBLSCancel { + public: + PbBLSCancel(void* infer_playload_id) + : updating_(false), infer_playload_id_(infer_playload_id), + is_cancelled_(false) + { + } + DISALLOW_COPY_AND_ASSIGN(PbBLSCancel); + + void SaveToSharedMemory(std::unique_ptr& shm_pool); + bi::managed_external_buffer::handle_t ShmHandle(); + CancelBLSRequestMessage* ShmPayload(); + + void Cancel(); + void ReportIsCancelled(bool is_cancelled); + + private: + AllocatedSharedMemory cancel_shm_; + + std::mutex mu_; + std::condition_variable cv_; + bool updating_; + + void* infer_playload_id_; + bool is_cancelled_; +}; + +}}}; // namespace triton::backend::python diff --git a/src/pb_cancel.cc b/src/pb_cancel.cc new file mode 100644 index 00000000..da9daf98 --- /dev/null +++ b/src/pb_cancel.cc @@ -0,0 +1,94 @@ +// Copyright 2023-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions +// are met: +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// * Neither the name of NVIDIA CORPORATION nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +#include "pb_cancel.h" + +#include "pb_stub.h" +#include "pb_stub_log.h" + +namespace triton { namespace backend { namespace python { + +void +PbCancel::SaveToSharedMemory(std::unique_ptr& shm_pool) +{ + cancel_shm_ = shm_pool->Construct(); + new (&(cancel_shm_.data_->mu)) bi::interprocess_mutex; + new (&(cancel_shm_.data_->cv)) bi::interprocess_condition; + cancel_shm_.data_->waiting_on_stub = false; + cancel_shm_.data_->response_factory_address = response_factory_address_; + cancel_shm_.data_->request_address = request_address_; + cancel_shm_.data_->is_cancelled = is_cancelled_; +} + +bi::managed_external_buffer::handle_t +PbCancel::ShmHandle() +{ + return cancel_shm_.handle_; +} + +IsCancelledMessage* +PbCancel::ShmPayload() +{ + return cancel_shm_.data_.get(); +} + +bool +PbCancel::IsCancelled() +{ + // Release the GIL. Python objects are not accessed during the check. + py::gil_scoped_release gil_release; + + std::unique_lock lk(mu_); + // The cancelled flag can only move from false to true, not the other way, so + // it is checked on each query until cancelled and then implicitly cached. + if (is_cancelled_) { + return is_cancelled_; + } + if (!updating_) { + std::unique_ptr& stub = Stub::GetOrCreateInstance(); + if (!stub->StubToParentServiceActive()) { + LOG_ERROR << "Cannot communicate with parent service"; + return false; + } + stub->EnqueueIsCancelled(this); + updating_ = true; + } + cv_.wait(lk, [this] { return !updating_; }); + return is_cancelled_; +} + +void +PbCancel::ReportIsCancelled(bool is_cancelled) +{ + { + std::lock_guard lk(mu_); + is_cancelled_ = is_cancelled; + updating_ = false; + } + cv_.notify_all(); +} + +}}} // namespace triton::backend::python diff --git a/src/pb_cancel.h b/src/pb_cancel.h new file mode 100644 index 00000000..3ebf07b5 --- /dev/null +++ b/src/pb_cancel.h @@ -0,0 +1,64 @@ +// Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions
+// are met:
+//  * Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+//  * Redistributions in binary form must reproduce the above copyright
+//    notice, this list of conditions and the following disclaimer in the
+//    documentation and/or other materials provided with the distribution.
+//  * Neither the name of NVIDIA CORPORATION nor the names of its
+//    contributors may be used to endorse or promote products derived
+//    from this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+#pragma once
+
+#include <condition_variable>
+#include <mutex>
+
+#include "pb_utils.h"
+
+namespace triton { namespace backend { namespace python {
+
+class PbCancel {
+ public:
+  PbCancel(intptr_t response_factory_address, intptr_t request_address)
+      : updating_(false), response_factory_address_(response_factory_address),
+        request_address_(request_address), is_cancelled_(false)
+  {
+  }
+  DISALLOW_COPY_AND_ASSIGN(PbCancel);
+
+  void SaveToSharedMemory(std::unique_ptr<SharedMemoryManager>& shm_pool);
+  bi::managed_external_buffer::handle_t ShmHandle();
+  IsCancelledMessage* ShmPayload();
+
+  bool IsCancelled();
+  void ReportIsCancelled(bool is_cancelled);
+
+ private:
+  AllocatedSharedMemory<IsCancelledMessage> cancel_shm_;
+
+  std::mutex mu_;
+  std::condition_variable cv_;
+  bool updating_;
+
+  intptr_t response_factory_address_;
+  intptr_t request_address_;
+  bool is_cancelled_;
+};
+
+}}};  // namespace triton::backend::python
diff --git a/src/pb_env.cc b/src/pb_env.cc
index 2065e6db..d9643a62 100644
--- a/src/pb_env.cc
+++ b/src/pb_env.cc
@@ -26,17 +26,45 @@
 #include "pb_env.h"
 
+#ifndef _WIN32
 #include
 #include
 #include
+#endif
+#include
+
 #include
 #include
 #include
+
 #include "pb_utils.h"
 
 namespace triton { namespace backend { namespace python {
 
+bool
+FileExists(std::string& path)
+{
+  struct stat buffer;
+  return stat(path.c_str(), &buffer) == 0;
+}
+
+void
+LastModifiedTime(const std::string& path, time_t* last_modified_time)
+{
+  struct stat result;
+  if (stat(path.c_str(), &result) == 0) {
+    *last_modified_time = result.st_mtime;
+  } else {
+    throw PythonBackendException(std::string(
+        "LastModifiedTime() failed as file \'" + path +
+        std::string("\' does not exist.")));
+  }
+}
+
+// FIXME: [DLIS-5969]: Develop platform-agnostic functions
+// to support custom python environments.
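The `PbCancel` and `PbBLSCancel` classes above carry cancellation state between the stub and the parent process. On the Python side this surfaces roughly as follows; `is_cancelled()` and the `TritonError` codes follow the documented API, while `cancel()` on the BLS response iterator is an assumed name that mirrors the `EnqueueCancelBLSRequest()` plumbing shown earlier:

    import triton_python_backend_utils as pb_utils

    # Sketch of TritonPythonModel.execute() with cancellation handling.
    def execute(self, requests):
        responses = []
        for request in requests:
            # Server-side request cancellation, backed by PbCancel::IsCancelled().
            if request.is_cancelled():
                responses.append(pb_utils.InferenceResponse(
                    error=pb_utils.TritonError(
                        "request cancelled", pb_utils.TritonError.CANCELLED)))
                continue
            # Decoupled BLS call that can itself be cancelled (PbBLSCancel path).
            bls_request = pb_utils.InferenceRequest(
                model_name="downstream_model",
                inputs=[],
                requested_output_names=[])
            response_iterator = bls_request.exec(decoupled=True)
            response_iterator.cancel()  # assumed name for the BLS cancel entry point
            ...
        return responses
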
+#ifndef _WIN32 void CopySingleArchiveEntry(archive* input_archive, archive* output_archive) { @@ -70,7 +98,6 @@ CopySingleArchiveEntry(archive* input_archive, archive* output_archive) } } - void ExtractTarFile(std::string& archive_path, std::string& dst_path) { @@ -150,27 +177,6 @@ ExtractTarFile(std::string& archive_path, std::string& dst_path) } } -bool -FileExists(std::string& path) -{ - struct stat buffer; - return stat(path.c_str(), &buffer) == 0; -} - -void -LastModifiedTime(const std::string& path, time_t* last_modified_time) -{ - struct stat result; - if (stat(path.c_str(), &result) == 0) { - *last_modified_time = result.st_mtime; - } else { - throw PythonBackendException(std::string( - "LastModifiedTime() failed as file \'" + path + - std::string("\' does not exists."))); - } -} - - void RecursiveDirectoryDelete(const char* dir) { @@ -251,6 +257,21 @@ EnvironmentManager::ExtractIfNotExtracted(std::string env_path) bool env_extracted = false; bool re_extraction = false; + + // If the path is not a conda-packed file, then bypass the extraction process + struct stat info; + if (stat(canonical_env_path, &info) != 0) { + throw PythonBackendException( + std::string("stat() of : ") + canonical_env_path + " returned error."); + } else if (S_ISDIR(info.st_mode)) { + LOG_MESSAGE( + TRITONSERVER_LOG_VERBOSE, + (std::string("Returning canonical path since EXECUTION_ENV_PATH does " + "not contain compressed path. Path: ") + + canonical_env_path) + .c_str()); + return canonical_env_path; + } const auto env_itr = env_map_.find(canonical_env_path); if (env_itr != env_map_.end()) { // Check if the environment has been modified and would @@ -308,5 +329,6 @@ EnvironmentManager::~EnvironmentManager() { RecursiveDirectoryDelete(base_path_); } +#endif }}} // namespace triton::backend::python diff --git a/src/pb_env.h b/src/pb_env.h index 668d05ef..04e01fa3 100644 --- a/src/pb_env.h +++ b/src/pb_env.h @@ -30,6 +30,11 @@ #include #include +#ifdef WIN32 +#include +#undef PATH_MAX +#define PATH_MAX MAX_PATH +#endif namespace triton { namespace backend { namespace python { void ExtractTarFile(std::string& archive_path, std::string& dst_path); @@ -39,6 +44,7 @@ bool FileExists(std::string& path); // // A class that manages Python environments // +#ifndef _WIN32 class EnvironmentManager { std::map> env_map_; char base_path_[PATH_MAX + 1]; @@ -52,5 +58,6 @@ class EnvironmentManager { std::string ExtractIfNotExtracted(std::string env_path); ~EnvironmentManager(); }; +#endif -}}} // namespace triton::backend::python \ No newline at end of file +}}} // namespace triton::backend::python diff --git a/src/pb_error.cc b/src/pb_error.cc index e190af42..0e5d0bd4 100644 --- a/src/pb_error.cc +++ b/src/pb_error.cc @@ -1,4 +1,4 @@ -// Copyright 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// Copyright 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
// // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions @@ -27,6 +27,13 @@ #include "pb_error.h" namespace triton { namespace backend { namespace python { + +TRITONSERVER_Error_Code +PbError::Code() +{ + return code_; +} + const std::string& PbError::Message() { @@ -43,7 +50,10 @@ void PbError::SaveToSharedMemory(std::unique_ptr& shm_pool) { message_shm_ = PbString::Create(shm_pool, message_); - shm_handle_ = message_shm_->ShmHandle(); + error_shm_ = shm_pool->Construct(); + error_shm_.data_->code = code_; + error_shm_.data_->message_shm_handle = message_shm_->ShmHandle(); + shm_handle_ = error_shm_.handle_; } std::shared_ptr @@ -51,14 +61,25 @@ PbError::LoadFromSharedMemory( std::unique_ptr& shm_pool, bi::managed_external_buffer::handle_t shm_handle) { - std::unique_ptr message_shm = - PbString::LoadFromSharedMemory(shm_pool, shm_handle); - return std::shared_ptr(new PbError(message_shm)); + AllocatedSharedMemory error_shm = + shm_pool->Load(shm_handle); + std::unique_ptr message_shm = PbString::LoadFromSharedMemory( + shm_pool, error_shm.data_->message_shm_handle); + + TRITONSERVER_Error_Code code = error_shm.data_->code; + std::string message = message_shm->String(); + + return std::shared_ptr(new PbError( + std::move(message_shm), std::move(error_shm), code, std::move(message))); } -PbError::PbError(std::unique_ptr& message_shm) +PbError::PbError( + std::shared_ptr&& message_shm, + AllocatedSharedMemory&& error_shm, TRITONSERVER_Error_Code code, + std::string&& message) + : message_shm_(std::move(message_shm)), error_shm_(std::move(error_shm)), + code_(code), message_(std::move(message)) { - message_shm_ = std::move(message_shm); - message_ = message_shm_->String(); } + }}} // namespace triton::backend::python diff --git a/src/pb_error.h b/src/pb_error.h index d4461082..6001459a 100644 --- a/src/pb_error.h +++ b/src/pb_error.h @@ -1,4 +1,4 @@ -// Copyright 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// Copyright 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
// // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions @@ -27,25 +27,49 @@ #pragma once #include + #include "pb_string.h" #include "pb_utils.h" namespace triton { namespace backend { namespace python { + +struct PbErrorShm { + TRITONSERVER_Error_Code code; + bi::managed_external_buffer::handle_t message_shm_handle; +}; + class PbError { public: - PbError(const std::string& message) : message_(message) {} + PbError( + const std::string& message, + TRITONSERVER_Error_Code code = TRITONSERVER_ERROR_INTERNAL) + : code_(code), message_(message) + { + } + DISALLOW_COPY_AND_ASSIGN(PbError); + + TRITONSERVER_Error_Code Code(); const std::string& Message(); + void SaveToSharedMemory(std::unique_ptr& shm_pool); bi::managed_external_buffer::handle_t ShmHandle(); + static std::shared_ptr LoadFromSharedMemory( std::unique_ptr& shm_pool, bi::managed_external_buffer::handle_t handle); - DISALLOW_COPY_AND_ASSIGN(PbError); private: - PbError(std::unique_ptr& pb_error); - std::string message_; + PbError( + std::shared_ptr&& message_shm, + AllocatedSharedMemory&& error_shm, + TRITONSERVER_Error_Code code, std::string&& message); + std::shared_ptr message_shm_; + AllocatedSharedMemory error_shm_; bi::managed_external_buffer::handle_t shm_handle_; + + TRITONSERVER_Error_Code code_; + std::string message_; }; + }}}; // namespace triton::backend::python diff --git a/src/pb_log.h b/src/pb_log.h index 62c24aa6..65d41009 100644 --- a/src/pb_log.h +++ b/src/pb_log.h @@ -27,13 +27,13 @@ #pragma once #include + #include "pb_string.h" #include "pb_utils.h" namespace triton { namespace backend { namespace python { class PbLog { public: - /// Create a PbLog instance PbLog( const std::string& filename, uint32_t line, const std::string& message, @@ -65,7 +65,7 @@ class PbLogShm { std::unique_ptr& shm_pool, const std::string& filename, const uint32_t& line, const std::string& message, const LogLevel& level); - + /// Load PbLog object to shared memory static std::unique_ptr LoadFromSharedMemory( std::unique_ptr& shm_pool, diff --git a/src/pb_map.h b/src/pb_map.h index c4827b7c..a231b719 100644 --- a/src/pb_map.h +++ b/src/pb_map.h @@ -27,6 +27,7 @@ #pragma once #include + #include "pb_string.h" #include "shm_manager.h" diff --git a/src/pb_memory.cc b/src/pb_memory.cc index 2354391f..5b678f1a 100644 --- a/src/pb_memory.cc +++ b/src/pb_memory.cc @@ -1,4 +1,4 @@ -// Copyright 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// Copyright 2022-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
// // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions @@ -26,6 +26,8 @@ #include "pb_memory.h" +#include + namespace triton { namespace backend { namespace python { std::unique_ptr @@ -35,7 +37,6 @@ PbMemory::Create( uint64_t byte_size, char* data, bool copy_gpu) { size_t requested_byte_size = sizeof(MemoryShm); - if (memory_type == TRITONSERVER_MEMORY_GPU) { #ifdef TRITON_ENABLE_GPU requested_byte_size += sizeof(cudaIpcMemHandle_t); @@ -46,9 +47,10 @@ PbMemory::Create( AllocatedSharedMemory memory_shm = shm_pool->Construct(requested_byte_size); + PbMemory::FillShmData( - memory_type, memory_type_id, byte_size, data, memory_shm.data_.get(), - memory_shm.handle_, copy_gpu); + shm_pool->GetCUDAMemoryPoolManager(), memory_type, memory_type_id, + byte_size, data, memory_shm.data_.get(), memory_shm.handle_, copy_gpu); if (memory_type == TRITONSERVER_MEMORY_CPU) { data = memory_shm.data_.get() + sizeof(MemoryShm); @@ -83,12 +85,14 @@ PbMemory::Create( std::unique_ptr PbMemory::Create( + std::unique_ptr& shm_pool, TRITONSERVER_MemoryType memory_type, int64_t memory_type_id, uint64_t byte_size, char* data, char* data_shm, bi::managed_external_buffer::handle_t handle, bool copy_gpu) { PbMemory::FillShmData( - memory_type, memory_type_id, byte_size, data, data_shm, handle, copy_gpu); + shm_pool->GetCUDAMemoryPoolManager(), memory_type, memory_type_id, + byte_size, data, data_shm, handle, copy_gpu); if (memory_type == TRITONSERVER_MEMORY_CPU) { data = data_shm + sizeof(MemoryShm); @@ -141,8 +145,16 @@ PbMemory::CopyBuffer( kind = cudaMemcpyDeviceToDevice; } - cudaError_t err = - cudaMemcpy(dst->DataPtr(), src->DataPtr(), src->ByteSize(), kind); + cudaError_t err; + if ((kind == cudaMemcpyDeviceToDevice) && + (src->MemoryTypeId() != dst->MemoryTypeId())) { + err = cudaMemcpyPeer( + dst->DataPtr(), dst->MemoryTypeId(), src->DataPtr(), + src->MemoryTypeId(), src->ByteSize()); + + } else { + err = cudaMemcpy(dst->DataPtr(), src->DataPtr(), src->ByteSize(), kind); + } if (err != cudaSuccess) { throw PythonBackendException( @@ -150,31 +162,51 @@ PbMemory::CopyBuffer( "failed to copy data: " + std::string(cudaGetErrorString(err))) .c_str()); } + + if (kind == cudaMemcpyDeviceToDevice) { + // Synchronize the default stream for d2d copies. + // https://docs.nvidia.com/cuda/cuda-runtime-api/api-sync-behavior.html#api-sync-behavior__memcpy-sync + err = cudaStreamSynchronize(0); + if (err != cudaSuccess) { + throw PythonBackendException( + std::string( + "failed to synchronize the default CUDA stream. 
error: " + + std::string(cudaGetErrorString(err))) + .c_str()); + } + } #endif } void PbMemory::FillShmData( + std::unique_ptr& cuda_pool, TRITONSERVER_MemoryType memory_type, int64_t memory_type_id, uint64_t byte_size, char* data, char* data_shm, bi::managed_external_buffer::handle_t handle, bool copy_gpu) { char* memory_data_shm = data_shm + sizeof(MemoryShm); MemoryShm* memory_shm_ptr = reinterpret_cast(data_shm); - memory_shm_ptr->is_cuda_handle_set = copy_gpu; memory_shm_ptr->memory_release_id = 0; + bool use_cuda_shared_pool = false; if (memory_type == TRITONSERVER_MEMORY_GPU) { #ifdef TRITON_ENABLE_GPU if (data != nullptr) { if (copy_gpu) { - // [FIXME] Restore the previous device - THROW_IF_CUDA_ERROR(cudaSetDevice(memory_type_id)); + ScopedSetDevice scoped_set_device(memory_type_id); THROW_IF_CUDA_ERROR(cudaIpcGetMemHandle( reinterpret_cast(memory_data_shm), data)); } + if (cuda_pool->UseCudaSharedPool(memory_type_id) && + IsUsingCUDAPool(cuda_pool, memory_type_id, data)) { + use_cuda_shared_pool = true; + memory_shm_ptr->cuda_pool_offset = + data - + reinterpret_cast(cuda_pool->CUDAPoolAddress(memory_type_id)); + } } -#endif +#endif // TRITON_ENABLE_GPU } else { if (data != nullptr) { std::copy(data, data + byte_size, memory_data_shm); @@ -184,45 +216,69 @@ PbMemory::FillShmData( memory_shm_ptr->byte_size = byte_size; memory_shm_ptr->memory_type_id = memory_type_id; memory_shm_ptr->memory_type = memory_type; + memory_shm_ptr->use_cuda_shared_pool = use_cuda_shared_pool; } std::unique_ptr PbMemory::LoadFromSharedMemory( + std::unique_ptr& shm_pool, bi::managed_external_buffer::handle_t handle, char* data_shm, bool open_cuda_handle) { MemoryShm* memory_shm_ptr = reinterpret_cast(data_shm); char* memory_data_shm = data_shm + sizeof(MemoryShm); - char* data_ptr = nullptr; bool opened_cuda_ipc_handle = false; if (memory_shm_ptr->memory_type == TRITONSERVER_MEMORY_GPU && open_cuda_handle) { #ifdef TRITON_ENABLE_GPU - cudaIpcMemHandle_t* cuda_handle = - reinterpret_cast(memory_data_shm); + if (memory_shm_ptr->use_cuda_shared_pool) { + // When CUDA shared memory pool is used, the stub will retrieve the + // data pointer using the offset. + data_ptr = + (reinterpret_cast( + shm_pool->GetCUDAMemoryPoolManager()->CUDAPoolAddress( + memory_shm_ptr->memory_type_id)) + + memory_shm_ptr->cuda_pool_offset); + } else { + cudaIpcMemHandle_t* cuda_handle = + reinterpret_cast(memory_data_shm); - // The pointer opened by the cudaIpcOpenMemHandle will refer to the base - // address. We need to manually correct the offset. - void* data_ptr_base; - CUDAHandler& cuda_handler = CUDAHandler::getInstance(); - cuda_handler.OpenCudaHandle( - memory_shm_ptr->memory_type_id, cuda_handle, &data_ptr_base); + // The pointer opened by the cudaIpcOpenMemHandle will refer to the base + // address. We need to manually correct the offset. + void* data_ptr_base; + CUDAHandler& cuda_handler = CUDAHandler::getInstance(); + cuda_handler.OpenCudaHandle( + memory_shm_ptr->memory_type_id, cuda_handle, &data_ptr_base); - data_ptr = - (reinterpret_cast(data_ptr_base) + - memory_shm_ptr->gpu_pointer_offset); - opened_cuda_ipc_handle = true; -#endif + data_ptr = + (reinterpret_cast(data_ptr_base) + + memory_shm_ptr->gpu_pointer_offset); + opened_cuda_ipc_handle = true; + } + +#endif // TRITON_ENABLE_GPU } else { data_ptr = memory_data_shm; } + + // This check only validates CPU shared memory access. 
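Stepping back to the CopyBuffer change above: when the source and destination tensors sit on different GPUs, the copy now goes through cudaMemcpyPeer and is followed by a default-stream synchronization, because device-to-device cudaMemcpy* calls may return before the copy has completed. The following standalone sketch illustrates only that path; the device ids, buffer size, and error handling are chosen here for illustration and are not taken from the diff.

#include <cuda_runtime.h>

#include <cstdio>

int main()
{
  const size_t byte_size = 1 << 20;
  void* src = nullptr;
  void* dst = nullptr;

  // Allocate one buffer on each device (device ids 0 and 1 are assumptions).
  cudaSetDevice(0);
  cudaMalloc(&src, byte_size);
  cudaSetDevice(1);
  cudaMalloc(&dst, byte_size);

  // Cross-device copy: cudaMemcpyPeer takes the destination and source
  // device ids explicitly, so no cudaMemcpyKind is needed.
  cudaError_t err = cudaMemcpyPeer(dst, 1, src, 0, byte_size);
  if (err == cudaSuccess) {
    // Make sure the device-to-device copy has finished before the buffers
    // are read or released.
    err = cudaStreamSynchronize(0);
  }
  if (err != cudaSuccess) {
    std::fprintf(stderr, "copy failed: %s\n", cudaGetErrorString(err));
  }

  cudaSetDevice(1);
  cudaFree(dst);
  cudaSetDevice(0);
  cudaFree(src);
  return 0;
}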
+ if (memory_shm_ptr->memory_type != TRITONSERVER_MEMORY_GPU && + (data_ptr + memory_shm_ptr->byte_size > + (char*)shm_pool->GetBaseAddress() + shm_pool->GetCurrentCapacity())) { + std::ostringstream oss; + oss << "0x" << std::hex + << (reinterpret_cast(data_ptr) + memory_shm_ptr->byte_size); + throw PythonBackendException( + std::string("Attempted to access out of bounds memory address ") + + oss.str()); + } + return std::unique_ptr(new PbMemory( data_shm, data_ptr, handle, opened_cuda_ipc_handle /* opened_cuda_ipc_handle */)); } - std::unique_ptr PbMemory::LoadFromSharedMemory( std::unique_ptr& shm_pool, @@ -238,26 +294,48 @@ PbMemory::LoadFromSharedMemory( if (memory_shm_ptr->memory_type == TRITONSERVER_MEMORY_GPU) { if (memory_shm_ptr->byte_size > 0 && open_cuda_handle) { #ifdef TRITON_ENABLE_GPU - cudaIpcMemHandle_t* cuda_handle = - reinterpret_cast(memory_data_shm); - - // The pointer opened by the cudaIpcOpenMemHandle will refer to the base - // address. We need to manually correct the offset. - - void* data_ptr_base; - CUDAHandler& cuda_handler = CUDAHandler::getInstance(); - cuda_handler.OpenCudaHandle( - memory_shm_ptr->memory_type_id, cuda_handle, &data_ptr_base); - - data_ptr = - (reinterpret_cast(data_ptr_base) + - memory_shm_ptr->gpu_pointer_offset); - opened_cuda_ipc_handle = true; + if (memory_shm_ptr->use_cuda_shared_pool) { + // When CUDA shared memory pool is used, the stub will retrieve the + // data pointer using the offset. + data_ptr = + (reinterpret_cast( + shm_pool->GetCUDAMemoryPoolManager()->CUDAPoolAddress( + memory_shm_ptr->memory_type_id)) + + memory_shm_ptr->cuda_pool_offset); + } else { + cudaIpcMemHandle_t* cuda_handle = + reinterpret_cast(memory_data_shm); + + // The pointer opened by the cudaIpcOpenMemHandle will refer to the base + // address. We need to manually correct the offset. + void* data_ptr_base; + CUDAHandler& cuda_handler = CUDAHandler::getInstance(); + cuda_handler.OpenCudaHandle( + memory_shm_ptr->memory_type_id, cuda_handle, &data_ptr_base); + + data_ptr = + (reinterpret_cast(data_ptr_base) + + memory_shm_ptr->gpu_pointer_offset); + opened_cuda_ipc_handle = true; + } #endif } } else { data_ptr = memory_data_shm; } + + // This check only validates CPU shared memory access. + if (memory_shm_ptr->memory_type != TRITONSERVER_MEMORY_GPU && + (data_ptr + memory_shm_ptr->byte_size > + (char*)shm_pool->GetBaseAddress() + shm_pool->GetCurrentCapacity())) { + std::ostringstream oss; + oss << "0x" << std::hex + << (reinterpret_cast(data_ptr) + memory_shm_ptr->byte_size); + throw PythonBackendException( + std::string("Attempted to access out of bounds memory address ") + + oss.str()); + } + return std::unique_ptr(new PbMemory( memory_shm, data_ptr, opened_cuda_ipc_handle /* opened_cuda_ipc_handle */)); @@ -383,6 +461,18 @@ PbMemory::SetCudaIpcHandle(cudaIpcMemHandle_t* cuda_ipc_handle) { *(reinterpret_cast(ShmData())) = *(cuda_ipc_handle); } + +void +PbMemory::UpdateCUDAOffset(std::unique_ptr& cuda_pool) +{ + if (cuda_pool->UseCudaSharedPool(MemoryTypeId()) && + IsUsingCUDAPool(cuda_pool, MemoryTypeId(), DataPtr())) { + memory_shm_ptr_->cuda_pool_offset = + DataPtr() - + reinterpret_cast(cuda_pool->CUDAPoolAddress(MemoryTypeId())); + memory_shm_ptr_->use_cuda_shared_pool = true; + } +} #endif PbMemory::~PbMemory() diff --git a/src/pb_memory.h b/src/pb_memory.h index 61df7762..ad79daed 100644 --- a/src/pb_memory.h +++ b/src/pb_memory.h @@ -1,4 +1,4 @@ -// Copyright 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+// Copyright 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions @@ -42,13 +42,18 @@ namespace triton { namespace backend { namespace python { // struct MemoryShm { // If the memory type is a GPU pointer, the offset of the GPU pointer from the - // base address. For CPU memory type this field contains garbage data. + // base address. For CPU memory type this field contains garbage data. This + // field will only be used when the memory is not allocated from the CUDA + // shared memory pool. uint64_t gpu_pointer_offset; + bool use_cuda_shared_pool; + // The offset of the memory from the base address of the CUDA shared memory + // pool. + uint64_t cuda_pool_offset; TRITONSERVER_MemoryType memory_type; int64_t memory_type_id; uint64_t byte_size; - bool is_cuda_handle_set; uint64_t memory_release_id; }; @@ -60,6 +65,7 @@ class PbMemory { uint64_t byte_size, char* data, bool copy_gpu = true); static std::unique_ptr Create( + std::unique_ptr& shm_pool, TRITONSERVER_MemoryType memory_type, int64_t memory_type_id, uint64_t byte_size, char* data, char* data_shm, bi::managed_external_buffer::handle_t handle, bool copy_gpu = true); @@ -72,6 +78,8 @@ class PbMemory { #ifdef TRITON_ENABLE_GPU void SetCudaIpcHandle(cudaIpcMemHandle_t* cuda_ipc_handle); + + void UpdateCUDAOffset(std::unique_ptr& cuda_pool); #endif // Copy the destination buffer to the source buffer. @@ -83,6 +91,7 @@ class PbMemory { bi::managed_external_buffer::handle_t memory_handle, bool open_cuda_handle); static std::unique_ptr LoadFromSharedMemory( + std::unique_ptr& shm_pool, bi::managed_external_buffer::handle_t handle, char* data_shm, bool open_cuda_handle); static uint64_t ShmStructSize( @@ -117,8 +126,25 @@ class PbMemory { void SetMemoryReleaseCallback(std::function release_callback); + bool UseCUDASharedPool() const + { + return memory_shm_ptr_->use_cuda_shared_pool; + } + ~PbMemory(); +#ifndef TRITON_PB_STUB + void SetBackendMemory(std::unique_ptr&& backend_memory) + { + backend_memory_ = std::move(backend_memory); + }; + + std::unique_ptr GetBackendMemory() + { + return std::move(backend_memory_); + }; +#endif + private: AllocatedSharedMemory memory_shm_; MemoryShm* memory_shm_ptr_; @@ -137,7 +163,7 @@ class PbMemory { bool opened_cuda_ipc_handle_; #ifdef TRITON_ENABLE_GPU - /// Calculate the pointer offest from the base address. + /// Calculate the pointer offset from the base address. /// \return The offset of a device pointer. /// \throws PythonBackendException if the tensor is stored in CPU. 
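The use_cuda_shared_pool and cuda_pool_offset fields above let the backend skip per-tensor cudaIpcMemHandle_t exchange whenever an output already lives inside the per-device CUDA memory pool: the writer records only the tensor's distance from the pool base, and the reader adds that distance to its own mapping of the same pool. A minimal sketch of that arithmetic follows; the pool type and base-address accessor are simplified stand-ins, not the backend's real CUDA memory pool manager API.

#include <cstdint>

// Simplified stand-in for the per-device pool bookkeeping; the real code
// obtains the base address from the CUDA memory pool manager.
struct CudaPoolView {
  void* base;  // where this process mapped the shared CUDA pool
};

// Writer side (FillShmData-style): store the offset, not the raw pointer.
inline uint64_t
OffsetInPool(const CudaPoolView& pool, void* data)
{
  return static_cast<uint64_t>(
      reinterpret_cast<char*>(data) - reinterpret_cast<char*>(pool.base));
}

// Reader side (LoadFromSharedMemory-style): the pool may be mapped at a
// different virtual address, but base + offset names the same allocation.
inline void*
PointerFromOffset(const CudaPoolView& pool, uint64_t offset)
{
  return reinterpret_cast<char*>(pool.base) + offset;
}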
uint64_t GetGPUPointerOffset(); @@ -150,6 +176,7 @@ class PbMemory { #endif static void FillShmData( + std::unique_ptr& cuda_pool, TRITONSERVER_MemoryType memory_type, int64_t memory_type_id, uint64_t byte_size, char* data, char* data_shm, bi::managed_external_buffer::handle_t handle, bool copy_gpu = true); diff --git a/src/pb_metric_reporter.h b/src/pb_metric_reporter.h index 88062f86..89c81b38 100644 --- a/src/pb_metric_reporter.h +++ b/src/pb_metric_reporter.h @@ -29,6 +29,7 @@ #include #include #include + #include "triton/core/tritonbackend.h" namespace triton { namespace backend { namespace python { diff --git a/src/pb_preferred_memory.h b/src/pb_preferred_memory.h index 55f4db89..c28f1b87 100644 --- a/src/pb_preferred_memory.h +++ b/src/pb_preferred_memory.h @@ -30,10 +30,10 @@ namespace triton { namespace backend { namespace python { class PreferredMemory { public: - enum MemoryType { GPU, CPU, DEFAULT }; + enum MemoryType { kGPU, kCPU, kDefault }; PreferredMemory() - : preferred_memory_type_(MemoryType::DEFAULT), preferred_device_id_(0) + : preferred_memory_type_(MemoryType::kDefault), preferred_device_id_(0) { } diff --git a/src/pb_response_iterator.cc b/src/pb_response_iterator.cc index 27a6c64b..536d4232 100644 --- a/src/pb_response_iterator.cc +++ b/src/pb_response_iterator.cc @@ -1,4 +1,4 @@ -// Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// Copyright 2023-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions @@ -25,10 +25,12 @@ // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "pb_response_iterator.h" -#include -#include "pb_stub.h" #include + +#include + +#include "pb_stub.h" namespace py = pybind11; namespace triton { namespace backend { namespace python { @@ -38,6 +40,7 @@ ResponseIterator::ResponseIterator( : id_(response->Id()), is_finished_(false), is_cleared_(false), idx_(0) { response_buffer_.push(response); + pb_bls_cancel_ = std::make_shared(response->Id()); } ResponseIterator::~ResponseIterator() @@ -98,19 +101,17 @@ ResponseIterator::Next() } } -py::iterator +void ResponseIterator::Iter() { if (is_finished_) { // If the previous iteration is finished, reset the index so that it will - // iterator from the begining of the responses. Otherwise just resume the + // iterator from the beginning of the responses. Otherwise just resume the // iteration from the previous index. if (idx_ >= responses_.size()) { idx_ = 0; } } - - return py::cast(*this); } void @@ -133,7 +134,7 @@ void ResponseIterator::Clear() { std::unique_ptr& stub = Stub::GetOrCreateInstance(); - stub->EnqueueCleanupId(id_); + stub->EnqueueCleanupId(id_, PYTHONSTUB_BLSDecoupledInferPayloadCleanup); { std::lock_guard lock{mu_}; response_buffer_.push(DUMMY_MESSAGE); @@ -159,4 +160,12 @@ ResponseIterator::GetExistingResponses() return responses; } +void +ResponseIterator::Cancel() +{ + if (!is_finished_) { + pb_bls_cancel_->Cancel(); + } +} + }}} // namespace triton::backend::python diff --git a/src/pb_response_iterator.h b/src/pb_response_iterator.h index 98351369..cb26d6a3 100644 --- a/src/pb_response_iterator.h +++ b/src/pb_response_iterator.h @@ -1,4 +1,4 @@ -// Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// Copyright 2023-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
// // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions @@ -27,7 +27,9 @@ #pragma once #include + #include "infer_response.h" +#include "pb_bls_cancel.h" namespace triton { namespace backend { namespace python { @@ -37,11 +39,12 @@ class ResponseIterator { ~ResponseIterator(); std::shared_ptr Next(); - py::iterator Iter(); + void Iter(); void EnqueueResponse(std::shared_ptr infer_response); void* Id(); void Clear(); std::vector> GetExistingResponses(); + void Cancel(); private: std::vector> responses_; @@ -52,6 +55,7 @@ class ResponseIterator { bool is_finished_; bool is_cleared_; size_t idx_; + std::shared_ptr pb_bls_cancel_; }; }}} // namespace triton::backend::python diff --git a/src/pb_stub.cc b/src/pb_stub.cc index f2bc4def..56048d78 100644 --- a/src/pb_stub.cc +++ b/src/pb_stub.cc @@ -1,4 +1,4 @@ -// Copyright 2021-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// Copyright 2021-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions @@ -28,7 +28,6 @@ #include #include -#include #include #include @@ -43,18 +42,26 @@ #include #include -#include "infer_response.h" +#include "correlation_id.h" +#include "model_loader.h" #include "pb_error.h" #include "pb_map.h" #include "pb_preferred_memory.h" #include "pb_response_iterator.h" #include "pb_string.h" +#include "pb_stub_log.h" #include "pb_utils.h" #include "response_sender.h" #include "scoped_defer.h" #include "shm_manager.h" #include "triton/common/nvtx.h" +#ifdef _WIN32 +#include // SIGINT & SIGTERM +#include +#else +#include +#endif #ifdef TRITON_ENABLE_GPU #include @@ -63,6 +70,10 @@ namespace py = pybind11; using namespace pybind11::literals; namespace bi = boost::interprocess; +#ifndef TRITON_ENABLE_GPU +using cudaStream_t = void*; +#endif + namespace triton { namespace backend { namespace python { std::atomic non_graceful_exit = {false}; @@ -73,17 +84,66 @@ SignalHandler(int signum) // Skip the SIGINT and SIGTERM } +template +PYTYPE +PyDefaultArgumentToMutableType(const py::object& argument) +{ + // The default argument on Python functions always reference the same copy, + // meaning if the default argument is changed by the function, then it is + // changed for all subsequent calls to the function. Thus, default arguments + // should be limited to basic types (i.e. None). This helper function returns + // an empty expected type, if the argument is None (i.e. default initialized). + // If the argument is neither None nor expected type, an exception is thrown. 
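A hypothetical binding that uses the helper described above; the module and function names are invented for illustration, and the snippet assumes the pybind11 headers and the namespace py = pybind11 alias already present in this file, plus the helper instantiated with the expected pybind11 type. The Python-facing default stays None, and each call gets a fresh py::dict instead of one shared default object.

// Hypothetical usage sketch, not part of the diff.
void
BindExample(py::module_& m)
{
  m.def(
      "example",
      [](const py::object& parameters_arg) {
        // A fresh, mutable dict per call when the caller omitted the
        // argument; throws if something other than None/dict was passed.
        py::dict parameters =
            PyDefaultArgumentToMutableType<py::dict>(parameters_arg);
        parameters["seen"] = true;
        return parameters;
      },
      py::arg("parameters") = py::none());
}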
+ if (py::isinstance(argument)) { + return PYTYPE(); + } + if (py::isinstance(argument)) { + return argument; + } + throw PythonBackendException( + std::string("Expect ") + typeid(PYTYPE).name() + ", got " + + std::string(py::str(argument.get_type()))); +} + +std::string +PyParametersToJSON(const py::dict& parameters) +{ + for (const auto& pair : parameters) { + if (!py::isinstance(pair.first)) { + throw PythonBackendException( + "Expect parameters keys to have type str, found type " + + std::string(py::str(pair.first.get_type()))); + } + if (!py::isinstance(pair.second) && + !py::isinstance(pair.second) && + !py::isinstance(pair.second)) { + throw PythonBackendException( + "Expect parameters values to have type bool/int/str, found type " + + std::string(py::str(pair.second.get_type()))); + } + } + py::module_ py_json = py::module_::import("json"); + std::string parameters_str = py::str(py_json.attr("dumps")(parameters)); + return parameters_str; +} + +void +AsyncEventFutureDoneCallback(const py::object& py_future) +{ + std::unique_ptr& stub = Stub::GetOrCreateInstance(); + stub->BackgroundFutureDone(py_future); +} + void Stub::Instantiate( int64_t shm_growth_size, int64_t shm_default_size, const std::string& shm_region_name, const std::string& model_path, const std::string& model_version, const std::string& triton_install_path, bi::managed_external_buffer::handle_t ipc_control_handle, - const std::string& name) + const std::string& name, const std::string& python_runtime_model) { - model_path_ = model_path; - model_version_ = model_version; - triton_install_path_ = triton_install_path; + model_context_.Init( + model_path, python_runtime_model, triton_install_path, model_version); name_ = name; health_mutex_ = nullptr; initialized_ = false; @@ -125,6 +185,7 @@ Stub::Instantiate( // interfere with the shared library resolution of other executable and // binaries. if (ipc_control_->uses_env) { +#ifndef _WIN32 char* ld_library_path = std::getenv("LD_LIBRARY_PATH"); if (ld_library_path != nullptr) { @@ -150,6 +211,11 @@ Stub::Instantiate( "When using an execution environment, LD_LIBRARY_PATH variable " "cannot be empty."); } +#else + throw PythonBackendException( + "Custom execution environments are not currently supported on " + "Windows."); +#endif } } catch (const PythonBackendException& pb_exception) { @@ -289,7 +355,7 @@ Stub::RunCommand() shm_pool_->Construct(); // The initialization is done in three steps. First the main process sends - // a message to the stub process asking to begin to initilize the Python + // a message to the stub process asking to begin to initialize the Python // model. After that is finished stub process sends a message to the // parent process that the initialization is finished. Finally, the // parent process sends a message to the stub process asking the stub @@ -340,15 +406,19 @@ Stub::RunCommand() shm_pool_->Load(ipc_message->Args()); RequestBatch* request_batch_shm_ptr = reinterpret_cast(request_batch.data_.get()); - if (!ipc_control_->decoupled) { - ProcessRequests(request_batch_shm_ptr); - } else { - ProcessRequestsDecoupled(request_batch_shm_ptr); - } + ProcessRequests(request_batch_shm_ptr); } break; case PYTHONSTUB_CommandType::PYTHONSTUB_FinalizeRequest: ipc_message->Command() = PYTHONSTUB_FinalizeResponse; + // Clean up response_iterator_map_ before sending sending message back to + // the parent process to make sure that the clean up message can be + // processed before the message queue is destroyed. 
+ { + std::lock_guard lock(response_iterator_map_mu_); + std::unordered_map>().swap( + response_iterator_map_); + } SendIPCMessage(ipc_message); return true; // Terminate the stub process case PYTHONSTUB_CommandType::PYTHONSTUB_LoadGPUBuffers: @@ -356,9 +426,10 @@ Stub::RunCommand() LoadGPUBuffers(ipc_message); } catch (const PythonBackendException& pb_exception) { - LOG_INFO << "An error occurred while trying to load GPU buffers in the " - "Python backend stub: " - << pb_exception.what() << std::endl; + LOG_ERROR + << "An error occurred while trying to load GPU buffers in the " + "Python backend stub: " + << pb_exception.what() << std::endl; } break; @@ -374,30 +445,7 @@ Stub::StubSetup() { py::module sys = py::module_::import("sys"); - std::string model_name = - model_path_.substr(model_path_.find_last_of("/") + 1); - - // Model name without the .py extension - auto dotpy_pos = model_name.find_last_of(".py"); - if (dotpy_pos == std::string::npos || dotpy_pos != model_name.size() - 1) { - throw PythonBackendException( - "Model name must end with '.py'. Model name is \"" + model_name + - "\"."); - } - - // The position of last character of the string that is searched for is - // returned by 'find_last_of'. Need to manually adjust the position. - std::string model_name_trimmed = model_name.substr(0, dotpy_pos - 2); - std::string model_path_parent = - model_path_.substr(0, model_path_.find_last_of("/")); - std::string model_path_parent_parent = - model_path_parent.substr(0, model_path_parent.find_last_of("/")); - std::string python_backend_folder = triton_install_path_; - sys.attr("path").attr("append")(model_path_parent); - sys.attr("path").attr("append")(model_path_parent_parent); - sys.attr("path").attr("append")(python_backend_folder); - sys = py::module_::import( - (std::string(model_version_) + "." 
+ model_name_trimmed).c_str()); + model_context_.StubSetup(sys); py::module python_backend_utils = py::module_::import("triton_python_backend_utils"); @@ -428,6 +476,18 @@ Stub::StubSetup() py::setattr( python_backend_utils, "TRITONSERVER_MEMORY_CPU", c_python_backend_utils.attr("TRITONSERVER_MEMORY_CPU")); + py::setattr( + python_backend_utils, "MetricFamily", + c_python_backend_utils.attr("MetricFamily")); + py::setattr( + python_backend_utils, "load_model", + c_python_backend_utils.attr("load_model")); + py::setattr( + python_backend_utils, "unload_model", + c_python_backend_utils.attr("unload_model")); + py::setattr( + python_backend_utils, "is_model_ready", + c_python_backend_utils.attr("is_model_ready")); c_python_backend_utils.attr("shared_memory") = py::cast(shm_pool_.get()); @@ -451,6 +511,13 @@ Stub::AutoCompleteModelConfig( py::module_::import("triton_python_backend_utils"); py::object model_config = python_backend_utils.attr("ModelConfig")(pb_string_shm->String()); + python_backend_utils.def( + "get_model_dir", + []() { + std::unique_ptr& stub = Stub::GetOrCreateInstance(); + return stub->GetModelDir(); + }, + py::return_value_policy::reference); if (py::hasattr(sys.attr("TritonPythonModel"), "auto_complete_config")) { model_config = sys.attr("TritonPythonModel") @@ -492,9 +559,19 @@ Stub::Initialize(bi::managed_external_buffer::handle_t map_handle) c_python_backend_utils.attr("InferenceResponse")); c_python_backend_utils.attr("shared_memory") = py::cast(shm_pool_.get()); + async_event_loop_ = py::none(); + background_futures_ = py::set(); + py::object TritonPythonModel = sys.attr("TritonPythonModel"); deserialize_bytes_ = python_backend_utils.attr("deserialize_bytes_tensor"); serialize_bytes_ = python_backend_utils.attr("serialize_byte_tensor"); + python_backend_utils.def( + "get_model_dir", + []() { + std::unique_ptr& stub = Stub::GetOrCreateInstance(); + return stub->GetModelDir(); + }, + py::return_value_policy::reference); model_instance_ = TritonPythonModel(); std::unordered_map map; @@ -522,55 +599,50 @@ Stub::Initialize(bi::managed_external_buffer::handle_t map_handle) } void -Stub::ProcessResponse(InferResponse* response) +Stub::LoadGPUBuffers(std::unique_ptr& ipc_message) { - response->SaveToSharedMemory(shm_pool_, false /* copy_gpu */); + ScopedDefer load_gpu_buffer_response([this] { + // LoadGPUBuffers must let the parent process know when loading the + // buffers have been finished. 
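The LoadGPUBuffers rewrite leans on ScopedDefer so that the parent process is notified and gpu_tensors_ is cleared on every exit path, including the newly added exception throws. Assuming ScopedDefer is essentially a run-at-scope-exit helper with an early Complete(), as its use later in ProcessRequests suggests, a minimal sketch of the idiom looks like this:

#include <functional>
#include <utility>

// Minimal stand-in for a scope-exit helper: runs the callable when the
// object goes out of scope unless Complete() already ran it.
class MiniDefer {
 public:
  explicit MiniDefer(std::function<void()> fn) : fn_(std::move(fn)) {}

  ~MiniDefer()
  {
    if (fn_) {
      fn_();
    }
  }

  // Run the deferred action immediately and disarm the destructor.
  void Complete()
  {
    if (fn_) {
      fn_();
      fn_ = nullptr;
    }
  }

 private:
  std::function<void()> fn_;
};

In the hunk this sketch mirrors, the deferred lambda still pushes DUMMY_MESSAGE to the parent queue and clears gpu_tensors_ whether the copies succeed or a PythonBackendException is thrown.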
+ parent_message_queue_->Push(DUMMY_MESSAGE); + gpu_tensors_.clear(); + }); - for (auto& output_tensor : response->OutputTensors()) { - if (!output_tensor->IsCPU()) { - gpu_tensors_.push_back(output_tensor); - } + AllocatedSharedMemory gpu_buffers_handle = + shm_pool_->Load(ipc_message->Args()); + + if (!gpu_buffers_handle.data_->success) { + std::unique_ptr error = PbString::LoadFromSharedMemory( + shm_pool_, gpu_buffers_handle.data_->error); + throw PythonBackendException( + "Failed to load GPU buffers: " + error->String()); } -} -void -Stub::LoadGPUBuffers(std::unique_ptr& ipc_message) -{ - AllocatedSharedMemory gpu_buffers_handle = - shm_pool_->Load(ipc_message->Args()); + uint64_t gpu_buffer_count = gpu_buffers_handle.data_->buffer_count; + AllocatedSharedMemory + gpu_buffers_handle_shm = + shm_pool_->Load( + gpu_buffers_handle.data_->buffers); - uint64_t* gpu_buffer_count = - reinterpret_cast(gpu_buffers_handle.data_.get()); - bi::managed_external_buffer::handle_t* gpu_buffers_handle_shm = - reinterpret_cast( - gpu_buffers_handle.data_.get() + sizeof(uint64_t)); - - if (gpu_tensors_.size() != *gpu_buffer_count) { - LOG_INFO - << (std::string( - "GPU buffers size does not match the provided buffers: ") + - std::to_string(gpu_tensors_.size()) + - " != " + std::to_string(*gpu_buffer_count)); - return; + if (gpu_tensors_.size() != gpu_buffer_count) { + throw PythonBackendException( + std::string("GPU buffers size does not match the provided buffers: ") + + std::to_string(gpu_tensors_.size()) + + " != " + std::to_string(gpu_buffer_count)); } std::vector> dst_buffers; - for (size_t i = 0; i < gpu_tensors_.size(); i++) { std::unique_ptr dst_buffer = PbMemory::LoadFromSharedMemory( - shm_pool_, gpu_buffers_handle_shm[i], true /* open_cuda_handle */); + shm_pool_, gpu_buffers_handle_shm.data_.get()[i], + true /* open_cuda_handle */); dst_buffers.emplace_back(std::move(dst_buffer)); } - ScopedDefer load_gpu_buffer_response( - [this] { parent_message_queue_->Push(DUMMY_MESSAGE); }); - for (size_t i = 0; i < gpu_tensors_.size(); i++) { std::shared_ptr& src_buffer = gpu_tensors_[i]; PbMemory::CopyBuffer(dst_buffers[i], src_buffer->Memory()); } - - gpu_tensors_.clear(); } py::list @@ -591,7 +663,8 @@ Stub::LoadRequestsFromSharedMemory(RequestBatch* request_batch_shm_ptr) for (size_t i = 0; i < batch_size; i++) { std::shared_ptr infer_request = InferRequest::LoadFromSharedMemory( - shm_pool_, request_shm_handle[i], true /* open_cuda_handle */); + shm_pool_, request_shm_handle[i], true /* open_cuda_handle */, + &ipc_control_->decoupled /* is_model_decoupled */); py_request_list.append(infer_request); } @@ -599,33 +672,26 @@ Stub::LoadRequestsFromSharedMemory(RequestBatch* request_batch_shm_ptr) } void -Stub::ProcessRequestsDecoupled(RequestBatch* request_batch_shm_ptr) +Stub::ProcessRequests(RequestBatch* request_batch_shm_ptr) { py::list py_request_list = LoadRequestsFromSharedMemory(request_batch_shm_ptr); - std::unique_ptr execute_response = - IPCMessage::Create(shm_pool_, false /* Inline response */); - execute_response->Command() = PYTHONSTUB_ExecuteResponse; + std::unique_ptr execute_response; - AllocatedSharedMemory response_batch = - shm_pool_->Construct(); - ResponseBatch* response_batch_shm_ptr = - reinterpret_cast(response_batch.data_.get()); - execute_response->Args() = response_batch.handle_; + std::optional> response_batch; bool has_exception = false; std::string error_string; std::unique_ptr error_string_shm; + std::string err_message; ScopedDefer execute_finalize([this] { 
stub_message_queue_->Pop(); }); ScopedDefer _( [this, &execute_response] { SendIPCMessage(execute_response); }); - + py::object execute_return; + py::object coroutine_return; try { - response_batch_shm_ptr->has_error = false; - response_batch_shm_ptr->is_error_set = false; - if (!py::hasattr(model_instance_, "execute")) { - std::string message = "Python model " + model_path_ + + std::string message = "Python model " + model_context_.PythonModelPath() + " does not implement `execute` method."; throw PythonBackendException(message); } @@ -633,13 +699,24 @@ Stub::ProcessRequestsDecoupled(RequestBatch* request_batch_shm_ptr) { NVTX_RANGE(nvtx_, "PyExecute " + name_); - py::object execute_return = - model_instance_.attr("execute")(py_request_list); - if (!py::isinstance(execute_return)) { - throw PythonBackendException( - "Python model '" + name_ + - "' is using the decoupled mode and the execute function must " - "return None."); + execute_return = model_instance_.attr("execute")(py_request_list); + + bool is_coroutine = py::module::import("asyncio") + .attr("iscoroutine")(execute_return) + .cast(); + if (is_coroutine) { + if (IsDecoupled()) { + // Do not wait for async decoupled execute to return. + RunCoroutine(execute_return, true /* in_background */); + } else { + coroutine_return = + RunCoroutine(execute_return, false /* in_background */); + ProcessReturnedResponses( + py_request_list, coroutine_return, response_batch); + } + } else { + ProcessReturnedResponses( + py_request_list, execute_return, response_batch); } } } @@ -653,152 +730,249 @@ Stub::ProcessRequestsDecoupled(RequestBatch* request_batch_shm_ptr) } if (has_exception) { - std::string err_message = - std::string( - "Failed to process the request(s) for model '" + name_ + - "', message: ") + - error_string; - LOG_INFO << err_message.c_str(); + err_message = std::string( + "Failed to process the request(s) for model '" + name_ + + "', message: ") + + error_string; + LOG_ERROR << err_message.c_str(); + if (!response_batch) { + response_batch = shm_pool_->Construct( + sizeof(ResponseBatch) + sizeof(IPCMessageShm)); + } + ResponseBatch* response_batch_shm_ptr = reinterpret_cast( + response_batch.value().data_.get() + sizeof(IPCMessageShm)); + + // The backend will clean up the response factory if there is an error in + // the response batch. For decoupled mode, it is necessary to handle cases + // where the response sender should have already cleaned up, ensuring the + // backend does not delete the response factory again during error handling. + if (IsDecoupled()) { + for (py::handle py_request : py_request_list) { + InferRequest* request = py_request.cast(); + if (request->GetResponseSender()->IsClosed()) { + response_batch_shm_ptr->is_response_factory_deleted = true; + } + } + } + response_batch_shm_ptr->has_error = true; - error_string_shm = PbString::Create(shm_pool_, error_string); + error_string_shm = PbString::Create(shm_pool_, err_message); response_batch_shm_ptr->error = error_string_shm->ShmHandle(); response_batch_shm_ptr->is_error_set = true; + response_batch_shm_ptr->batch_size = 0; + // Once the error is sent to the backend, the backend is supposed to close + // all response factories if not already closed, so closing all response + // senders if not already closed to prevent the model from sending more + // responses after the factories are closed. 
+ for (py::handle py_request : py_request_list) { + InferRequest* request = py_request.cast(); + request->GetResponseSender()->Close(); + } + } else { + if (!response_batch) { + response_batch = shm_pool_->Construct( + sizeof(ResponseBatch) + sizeof(IPCMessageShm)); + ResponseBatch* response_batch_shm_ptr = reinterpret_cast( + response_batch.value().data_.get() + sizeof(IPCMessageShm)); + response_batch_shm_ptr->batch_size = 0; + } + ResponseBatch* response_batch_shm_ptr = reinterpret_cast( + response_batch.value().data_.get() + sizeof(IPCMessageShm)); + response_batch_shm_ptr->has_error = false; + response_batch_shm_ptr->is_error_set = false; } + + execute_response = IPCMessage::Create( + reinterpret_cast(response_batch.value().data_.get()), + response_batch.value().handle_); + execute_response->Args() = + response_batch.value().handle_ + sizeof(IPCMessageShm); + execute_response->InlineResponse() = false; + execute_response->Command() = PYTHONSTUB_ExecuteResponse; + _.Complete(); + execute_finalize.Complete(); } void -Stub::ProcessRequests(RequestBatch* request_batch_shm_ptr) +Stub::ProcessResponse(InferResponse* response) { - std::unique_ptr execute_response = - IPCMessage::Create(shm_pool_, false /* Inline response */); - execute_response->Command() = PYTHONSTUB_ExecuteResponse; - - AllocatedSharedMemory response_batch = shm_pool_->Construct( - request_batch_shm_ptr->batch_size * - sizeof(bi::managed_external_buffer::handle_t) + - sizeof(ResponseBatch)); - ResponseBatch* response_batch_shm_ptr = - reinterpret_cast(response_batch.data_.get()); - - std::unique_ptr error_string_shm; - py::list inference_responses; - - bi::managed_external_buffer::handle_t* responses_shm_handle = - reinterpret_cast( - response_batch.data_.get() + sizeof(ResponseBatch)); - - py::list responses; - - // Notifying the stub should be after responses. - ScopedDefer execute_finalize([this] { stub_message_queue_->Pop(); }); - ScopedDefer _( - [this, &execute_response] { SendIPCMessage(execute_response); }); - - execute_response->Args() = response_batch.handle_; - - bool has_exception = false; - std::string error_string; - try { - response_batch_shm_ptr->has_error = false; - response_batch_shm_ptr->is_error_set = false; - - uint32_t batch_size = request_batch_shm_ptr->batch_size; - - if (batch_size == 0) { - return; - } - - py::list py_request_list = - LoadRequestsFromSharedMemory(request_batch_shm_ptr); + response->SaveToSharedMemory(shm_pool_, false /* copy_gpu */); - if (!py::hasattr(model_instance_, "execute")) { - std::string message = "Python model " + model_path_ + - " does not implement `execute` method."; - throw PythonBackendException(message); + for (auto& output_tensor : response->OutputTensors()) { + if (!output_tensor->IsCPU()) { + gpu_tensors_.push_back(output_tensor); } + } +} - py::object request_list = py_request_list; - py::module asyncio = py::module::import("asyncio"); +void +Stub::ProcessReturnedResponses( + py::list py_requests, py::object py_responses_obj, + std::optional>& response_batch) +{ + // Return if there is nothing to process. + if (py::isinstance(py_responses_obj)) { + return; + } + // Only non-decoupled may return responses. + if (IsDecoupled()) { + throw PythonBackendException( + "Python model '" + name_ + + "' is using the decoupled mode and the execute function must return " + "None."); + } + // Check responses is a list. 
+ if (!py::isinstance(py_responses_obj)) { + throw PythonBackendException( + "Expected a list in the execute return, found type '" + + std::string(py::str(py_responses_obj.get_type())) + "'."); + } + py::list py_responses = py_responses_obj; + // Responses and requests length must match. + size_t requests_size = py::len(py_requests); + size_t responses_size = py::len(py_responses); + if (requests_size != responses_size) { + throw PythonBackendException( + "Number of InferenceResponse objects do not match the number of " + "InferenceRequest objects. InferenceRequest(s) size is:" + + std::to_string(requests_size) + ", and InferenceResponse(s) size is:" + + std::to_string(responses_size) + "\n"); + } - // Execute Response - py::object execute_return; - py::object responses_obj; - bool is_coroutine; + for (size_t i = 0; i < responses_size; i++) { + if (!py::isinstance(py_responses[i])) { + InferRequest* request = py_requests[i].cast(); + // Response must be None if rescheduled. + if (request->ReleaseFlags() == TRITONSERVER_REQUEST_RELEASE_RESCHEDULE) { + throw PythonBackendException( + "Expected a None object in the execute function return list for " + "reschduled request, found type '" + + std::string(py::str(py_responses[i].get_type())) + "'."); + } + // Send the response. + if (!py::isinstance(py_responses[i])) { + throw PythonBackendException( + "Expected an 'InferenceResponse' object in the execute function " + "return list, found type '" + + std::string(py::str(py_responses[i].get_type())) + "'."); + } - { - NVTX_RANGE(nvtx_, "PyExecute " + name_); - execute_return = model_instance_.attr("execute")(request_list); - is_coroutine = asyncio.attr("iscoroutine")(execute_return).cast(); + InferResponse* response = py_responses[i].cast(); + try { + request->GetResponseSender()->UpdateStateAndCounters( + response, TRITONSERVER_RESPONSE_COMPLETE_FINAL); + } + catch (const PythonBackendException& pb_exception) { + // Handle the exception here to catch the error when there's a response + // returned from `execute()`. + if (request->GetResponseSender()->IsClosed()) { + response_batch = std::move(shm_pool_->Construct( + sizeof(ResponseBatch) + sizeof(IPCMessageShm))); + ResponseBatch* response_batch_shm_ptr = + reinterpret_cast( + response_batch.value().data_.get() + sizeof(IPCMessageShm)); + response_batch_shm_ptr->batch_size = 0; + response_batch_shm_ptr->is_response_factory_deleted = true; + } + throw pb_exception; + } } + } + // Return all the created responses using response_batch. The reason + // that both of the paths are available is that sending the responses + // using response_batch is faster than using `response_sender`. + response_batch = std::move(shm_pool_->Construct( + sizeof(IPCMessageShm) + + requests_size * sizeof(bi::managed_external_buffer::handle_t) + + sizeof(ResponseBatch))); + ResponseBatch* response_batch_shm_ptr = reinterpret_cast( + response_batch.value().data_.get() + sizeof(IPCMessageShm)); - if (is_coroutine) { - responses_obj = asyncio.attr("run")(execute_return); + bi::managed_external_buffer::handle_t* responses_shm_handle = + reinterpret_cast( + response_batch.value().data_.get() + sizeof(ResponseBatch) + + sizeof(IPCMessageShm)); + for (size_t i = 0; i < responses_size; i++) { + // Check the return type of execute function. 
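The Construct call above sizes a single allocation for everything the parent needs to read back, and the two reinterpret_casts then index into it. Laid out explicitly, the buffer is [IPCMessageShm][ResponseBatch][one response handle per request], which is why the handle array starts at sizeof(IPCMessageShm) + sizeof(ResponseBatch). A small sketch of that offset math with placeholder types (the struct contents here are illustrative, not the real layouts):

#include <cstddef>
#include <cstdint>

// Placeholders standing in for IPCMessageShm, ResponseBatch, and the
// managed_external_buffer handle; only the offsets matter here.
struct IpcHeader {
  char bytes[64];
};
struct BatchHeader {
  uint32_t batch_size;
  bool has_error;
};
using Handle = uint64_t;

// One flat allocation: [IpcHeader][BatchHeader][Handle x request_count].
inline size_t
TotalBytes(size_t request_count)
{
  return sizeof(IpcHeader) + sizeof(BatchHeader) +
         request_count * sizeof(Handle);
}

inline BatchHeader*
BatchAt(char* base)
{
  return reinterpret_cast<BatchHeader*>(base + sizeof(IpcHeader));
}

inline Handle*
HandlesAt(char* base)
{
  return reinterpret_cast<Handle*>(
      base + sizeof(IpcHeader) + sizeof(BatchHeader));
}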
+ InferRequest* infer_request = py_requests[i].cast(); + InferResponse* infer_response = py_responses[i].cast(); + if (!py::isinstance(py_responses[i])) { + infer_response->PruneOutputTensors(infer_request->RequestedOutputNames()); + ProcessResponse(infer_response); + responses_shm_handle[i] = infer_response->ShmHandle(); } else { - responses_obj = execute_return; + responses_shm_handle[i] = 0; } + } + response_batch_shm_ptr->batch_size = requests_size; +} - // Check the return type of execute function. - if (!py::isinstance(responses_obj)) { - std::string str = py::str(execute_return.get_type()); - throw PythonBackendException( - std::string("Expected a list in the execute return, found type '") + - str + "'."); - } +py::object +Stub::GetAsyncEventLoop() +{ + if (py::isinstance(async_event_loop_)) { + // Create the event loop if not already. + py::module asyncio = py::module_::import("asyncio"); + async_event_loop_ = asyncio.attr("new_event_loop")(); + asyncio.attr("set_event_loop")(async_event_loop_); + py::object py_thread = + py::module_::import("threading") + .attr("Thread")( + "target"_a = async_event_loop_.attr("run_forever"), + "daemon"_a = true); + py_thread.attr("start")(); + } + return async_event_loop_; +} - responses = responses_obj; - size_t response_size = py::len(responses); - - // If the number of request objects do not match the number of - // response objects throw an error. - if (response_size != batch_size) { - std::string err = - "Number of InferenceResponse objects do not match the number " - "of " - "InferenceRequest objects. InferenceRequest(s) size is:" + - std::to_string(batch_size) + ", and InferenceResponse(s) size is:" + - std::to_string(response_size) + "\n"; - throw PythonBackendException(err); +py::object +Stub::RunCoroutine(py::object coroutine, bool in_background) +{ + py::object loop = GetAsyncEventLoop(); + py::object py_future = py::module_::import("asyncio").attr( + "run_coroutine_threadsafe")(coroutine, loop); + if (in_background) { + py_future.attr("add_done_callback")( + py::module_::import("c_python_backend_utils") + .attr("async_event_future_done_callback")); + background_futures_.attr("add")(py_future); + return py::none(); + } + return py_future.attr("result")(); +} + +void +Stub::BackgroundFutureDone(const py::object& py_future) +{ + ScopedDefer _([this, &py_future] { + // Remove future from background + try { + background_futures_.attr("remove")(py_future); } - for (auto& response : responses) { - // Check the return type of execute function. - if (!py::isinstance(response)) { - std::string str = py::str(response.get_type()); - throw PythonBackendException( - std::string("Expected an 'InferenceResponse' object in the execute " - "function return list, found type '") + - str + "'."); - } + catch (const py::error_already_set& error) { + LOG_ERROR << "Cannot remove future from background; " << error.what(); } - response_batch_shm_ptr->batch_size = response_size; - - for (size_t i = 0; i < batch_size; i++) { - InferResponse* infer_response = responses[i].cast(); - InferRequest* infer_request = py_request_list[i].cast(); - infer_response->PruneOutputTensors(infer_request->RequestedOutputNames()); - - ProcessResponse(infer_response); - responses_shm_handle[i] = infer_response->ShmHandle(); + }); + // TODO: Why using `py_future.result()` with error hangs on exit? 
+ try { + py::object exception = py_future.attr("exception")(); + if (!py::isinstance(exception)) { + std::string err_msg = ""; + py::object traceback = py::module_::import("traceback") + .attr("TracebackException") + .attr("from_exception")(exception) + .attr("format")(); + for (py::handle line : traceback) { + err_msg += py::str(line); + } + LOG_ERROR << err_msg; } } catch (const PythonBackendException& pb_exception) { - has_exception = true; - error_string = pb_exception.what(); + LOG_ERROR << pb_exception.what(); } catch (const py::error_already_set& error) { - has_exception = true; - error_string = error.what(); - } - - if (has_exception) { - std::string err_message = - std::string( - "Failed to process the request(s) for model '" + name_ + - "', message: ") + - error_string; - error_string_shm = PbString::Create(shm_pool_, error_string); - response_batch_shm_ptr->has_error = true; - response_batch_shm_ptr->is_error_set = true; - response_batch_shm_ptr->error = error_string_shm->ShmHandle(); + LOG_ERROR << error.what(); } } @@ -813,15 +987,35 @@ void Stub::Finalize() { finalizing_ = true; - // Call finalize if exists. - if (initialized_ && py::hasattr(model_instance_, "finalize")) { - try { - model_instance_.attr("finalize")(); + if (initialized_) { + // Stop async event loop if created. + if (!py::isinstance(async_event_loop_)) { + async_event_loop_.attr("stop")(); } - catch (const py::error_already_set& e) { - LOG_INFO << e.what(); + // Call finalize if exists. + if (py::hasattr(model_instance_, "finalize")) { + try { + model_instance_.attr("finalize")(); + } + catch (const py::error_already_set& e) { + LOG_INFO << e.what(); + } } } +#ifdef TRITON_ENABLE_GPU + // We also need to destroy created proxy CUDA streams for dlpack, if any + std::lock_guard lock(dlpack_proxy_stream_pool_mu_); + for (auto& entry : dlpack_proxy_stream_pool_) { + // We don't need to switch device to destroy a stream + // https://stackoverflow.com/questions/64663943/how-to-destroy-a-stream-that-was-created-on-a-specific-device + cudaError_t err = cudaStreamDestroy(entry.second); + if (err != cudaSuccess) { + LOG_ERROR + << "Failed to destroy dlpack CUDA proxy stream on device with id " + + std::to_string(entry.first); + } + } +#endif } void @@ -844,11 +1038,31 @@ Stub::SendIPCUtilsMessage(std::unique_ptr& ipc_message) Stub::~Stub() { - { +#ifdef TRITON_ENABLE_GPU + try { + if (shm_pool_ != nullptr) { + CUDAHandler& cuda_api = CUDAHandler::getInstance(); + for (auto& m : + shm_pool_->GetCUDAMemoryPoolManager()->CUDAPoolAddressMap()) { + if (m.second != nullptr) { + cuda_api.CloseCudaHandle(m.first, m.second); + } + } + } + } + catch (const PythonBackendException& pb_exception) { + std::cerr << "Error when closing CUDA handle: " << pb_exception.what(); + } +#endif + + // Ensure the interpreter is active before trying to clean up. 
+ if (Py_IsInitialized()) { py::gil_scoped_acquire acquire; - model_instance_ = py::none(); + py::object async_event_loop_local(std::move(async_event_loop_)); + py::object background_futures_local(std::move(background_futures_)); + py::object model_instance_local(std::move(model_instance_)); } - stub_instance_.reset(); + stub_message_queue_.reset(); parent_message_queue_.reset(); stub_to_parent_mq_.reset(); @@ -882,8 +1096,8 @@ Stub::TerminateStubToParentQueueMonitor() Logger::GetOrCreateInstance()->SetBackendLoggingActive(false); { std::lock_guard guard{stub_to_parent_message_mu_}; - log_request_buffer_.push(DUMMY_MESSAGE); - bls_response_cleanup_buffer_.push(DUMMY_MESSAGE); + // Push a dummy message to signal the thread to terminate. + stub_to_parent_buffer_.push(DUMMY_MESSAGE); } stub_to_parent_message_cv_.notify_one(); stub_to_parent_queue_monitor_.join(); @@ -892,11 +1106,10 @@ Stub::TerminateStubToParentQueueMonitor() void Stub::EnqueueLogRequest(std::unique_ptr& log_ptr) { - { - std::lock_guard guard{stub_to_parent_message_mu_}; - log_request_buffer_.push(std::move(log_ptr)); - } - stub_to_parent_message_cv_.notify_one(); + std::unique_ptr utils_msg_payload = + std::make_unique( + PYTHONSTUB_LogRequest, reinterpret_cast(log_ptr.release())); + EnqueueUtilsMessage(std::move(utils_msg_payload)); } void @@ -904,43 +1117,46 @@ Stub::ServiceStubToParentRequests() { while (stub_to_parent_thread_) { std::unique_lock guard{stub_to_parent_message_mu_}; - while (log_request_buffer_.empty() && - bls_response_cleanup_buffer_.empty()) { + while (stub_to_parent_buffer_.empty()) { stub_to_parent_message_cv_.wait(guard); } - if (!log_request_buffer_.empty()) { - // On exit, will send messages until - // DUMMY_MESSAGE is reached - std::unique_ptr log_request = - std::move(log_request_buffer_.front()); - if (log_request == DUMMY_MESSAGE) { - log_request_buffer_.pop(); - break; - } else { - log_request_buffer_.pop(); - SendLogMessage(log_request); - } - } - if (!bls_response_cleanup_buffer_.empty()) { - void* id = std::move(bls_response_cleanup_buffer_.front()); - if (id == DUMMY_MESSAGE) { - bls_response_cleanup_buffer_.pop(); - break; + // On exit, will send messages to the parent process until + // DUMMY_MESSAGE is reached + std::unique_ptr utils_msg_payload = + std::move(stub_to_parent_buffer_.front()); + if (utils_msg_payload == DUMMY_MESSAGE) { + stub_to_parent_buffer_.pop(); + break; + } else { + stub_to_parent_buffer_.pop(); + if (utils_msg_payload->command_type == PYTHONSTUB_LogRequest) { + SendLogMessage(utils_msg_payload); + } else if ( + (utils_msg_payload->command_type == + PYTHONSTUB_BLSDecoupledInferPayloadCleanup) || + (utils_msg_payload->command_type == + PYTHONSTUB_DecoupledResponseFactoryCleanup)) { + SendCleanupId(utils_msg_payload, utils_msg_payload->command_type); + } else if ( + utils_msg_payload->command_type == PYTHONSTUB_IsRequestCancelled) { + SendIsCancelled(utils_msg_payload); + } else if ( + utils_msg_payload->command_type == PYTHONSTUB_CancelBLSInferRequest) { + SendCancelBLSRequest(utils_msg_payload); } else { - bls_response_cleanup_buffer_.pop(); - { - std::lock_guard lock(response_iterator_map_mu_); - response_iterator_map_.erase(id); - } - SendCleanupId(id); + std::cerr << "Error when sending message via stub_to_parent message " + "buffer - unknown command\n"; } } } } void -Stub::SendLogMessage(std::unique_ptr& log_send_message) +Stub::SendLogMessage(std::unique_ptr& utils_msg_payload) { + std::unique_ptr log_send_message = std::unique_ptr( + 
reinterpret_cast(utils_msg_payload->utils_message_ptr)); + std::unique_ptr log_request_shm = PbLogShm::Create( shm_pool_, log_send_message->Filename(), log_send_message->Line(), log_send_message->Message(), log_send_message->Level()); @@ -969,11 +1185,19 @@ Stub::SendLogMessage(std::unique_ptr& log_send_message) } void -Stub::SendCleanupId(void* id) +Stub::SendCleanupId( + std::unique_ptr& utils_msg_payload, + const PYTHONSTUB_CommandType& command_type) { + void* id = utils_msg_payload->utils_message_ptr; + if (command_type == PYTHONSTUB_BLSDecoupledInferPayloadCleanup) { + std::lock_guard lock(response_iterator_map_mu_); + response_iterator_map_.erase(id); + } + std::unique_ptr ipc_message = IPCMessage::Create(shm_pool_, true /* inline_response */); - ipc_message->Command() = PYTHONSTUB_CleanupRequest; + ipc_message->Command() = command_type; AllocatedSharedMemory cleanup_request_message = shm_pool_->Construct( sizeof(CleanupMessage) + @@ -995,15 +1219,91 @@ Stub::SendCleanupId(void* id) } void -Stub::EnqueueCleanupId(void* id) +Stub::EnqueueCleanupId(void* id, const PYTHONSTUB_CommandType& command_type) { if (id != nullptr) { - { - std::lock_guard guard{stub_to_parent_message_mu_}; - bls_response_cleanup_buffer_.push(id); + std::unique_ptr utils_msg_payload = + std::make_unique(command_type, id); + EnqueueUtilsMessage(std::move(utils_msg_payload)); + } +} + +void +Stub::SendCancelBLSRequest( + std::unique_ptr& utils_msg_payload) +{ + PbBLSCancel* pb_bls_cancel = + reinterpret_cast(utils_msg_payload->utils_message_ptr); + pb_bls_cancel->SaveToSharedMemory(shm_pool_); + + CancelBLSRequestMessage* message_payload = pb_bls_cancel->ShmPayload(); + std::unique_ptr ipc_message = + IPCMessage::Create(shm_pool_, false /* inline_response */); + ipc_message->Command() = utils_msg_payload->command_type; + ipc_message->Args() = pb_bls_cancel->ShmHandle(); + + bool is_cancelled = false; + { + bi::scoped_lock lk(message_payload->mu); + + SendIPCUtilsMessage(ipc_message); + while (!message_payload->waiting_on_stub) { + message_payload->cv.wait(lk); + } + + is_cancelled = message_payload->is_cancelled; + message_payload->waiting_on_stub = false; + message_payload->cv.notify_all(); + } + pb_bls_cancel->ReportIsCancelled(is_cancelled); +} + +void +Stub::EnqueueCancelBLSRequest(PbBLSCancel* pb_bls_cancel) +{ + std::unique_ptr utils_msg_payload = + std::make_unique( + PYTHONSTUB_CancelBLSInferRequest, + reinterpret_cast(pb_bls_cancel)); + EnqueueUtilsMessage(std::move(utils_msg_payload)); +} + +void +Stub::EnqueueIsCancelled(PbCancel* pb_cancel) +{ + std::unique_ptr utils_msg_payload = + std::make_unique( + PYTHONSTUB_IsRequestCancelled, reinterpret_cast(pb_cancel)); + EnqueueUtilsMessage(std::move(utils_msg_payload)); +} + +void +Stub::SendIsCancelled(std::unique_ptr& utils_msg_payload) +{ + PbCancel* pb_cancel = + reinterpret_cast(utils_msg_payload->utils_message_ptr); + pb_cancel->SaveToSharedMemory(shm_pool_); + + IsCancelledMessage* message_payload = pb_cancel->ShmPayload(); + std::unique_ptr ipc_message = + IPCMessage::Create(shm_pool_, false /* inline_response */); + ipc_message->Command() = utils_msg_payload->command_type; + ipc_message->Args() = pb_cancel->ShmHandle(); + + bool is_cancelled = false; + { + bi::scoped_lock lk(message_payload->mu); + + SendIPCUtilsMessage(ipc_message); + while (!message_payload->waiting_on_stub) { + message_payload->cv.wait(lk); } - stub_to_parent_message_cv_.notify_one(); + + is_cancelled = message_payload->is_cancelled; + message_payload->waiting_on_stub = false; + 
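SendCancelBLSRequest above, SendIsCancelled here, and GetCUDAMemoryPoolAddress later in this file all coordinate through the same waiting_on_stub flag: one process waits on an interprocess condition while the other fills in the result, flips the flag, and notifies. Distilled into a standalone shape (field and function names are illustrative, not the backend's shared-memory message types):

#include <boost/interprocess/sync/interprocess_condition.hpp>
#include <boost/interprocess/sync/interprocess_mutex.hpp>
#include <boost/interprocess/sync/scoped_lock.hpp>

namespace ipc = boost::interprocess;

// Lives in shared memory, visible to both the stub and the parent.
struct Handshake {
  ipc::interprocess_mutex mu;
  ipc::interprocess_condition cv;
  bool waiting_on_stub = false;
  bool is_cancelled = false;
};

// Requester side: block until the other process has filled in the answer.
bool
WaitForAnswer(Handshake* payload)
{
  ipc::scoped_lock<ipc::interprocess_mutex> lk(payload->mu);
  while (!payload->waiting_on_stub) {
    payload->cv.wait(lk);
  }
  const bool is_cancelled = payload->is_cancelled;
  payload->waiting_on_stub = false;  // give the turn back to the other side
  payload->cv.notify_all();
  return is_cancelled;
}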
message_payload->cv.notify_all(); } + pb_cancel->ReportIsCancelled(is_cancelled); } bool @@ -1025,6 +1325,7 @@ Stub::TerminateParentToStubQueueMonitor() { if (parent_to_stub_thread_) { parent_to_stub_thread_ = false; + // Push a dummy message to signal the thread to terminate. parent_to_stub_mq_->Push(DUMMY_MESSAGE); parent_to_stub_queue_monitor_.join(); } @@ -1039,89 +1340,18 @@ Stub::ParentToStubMQMonitor() break; } - std::unique_ptr ipc_message; - ResponseBatch* response_batch = nullptr; - bi::managed_external_buffer::handle_t* response_handle = nullptr; - std::unique_ptr infer_response; - bool responses_is_set = false; - PythonBackendException pb_exception(std::string{}); - - try { - ipc_message = IPCMessage::LoadFromSharedMemory(shm_pool_, handle); - AllocatedSharedMemory response_batch_shm = - shm_pool_->Load(ipc_message->Args()); - response_batch = - reinterpret_cast(response_batch_shm.data_.get()); - response_handle = - reinterpret_cast( - response_batch_shm.data_.get() + sizeof(ResponseBatch)); - responses_is_set = true; - - if (response_batch->has_error) { - if (response_batch->is_error_set) { - std::unique_ptr pb_string = - PbString::LoadFromSharedMemory(shm_pool_, response_batch->error); - infer_response = std::make_unique( - std::vector>{}, - std::make_shared(pb_string->String())); - } else { - infer_response = std::make_unique( - std::vector>{}, - std::make_shared( - "An error occurred while performing BLS request.")); - } - } - - if (responses_is_set) { - infer_response = InferResponse::LoadFromSharedMemory( - shm_pool_, *response_handle, true /* open cuda handle */); - - for (auto& output_tensor : infer_response->OutputTensors()) { - if (!output_tensor->IsCPU()) { - uint64_t memory_release_id = - output_tensor->Memory()->MemoryReleaseId(); - output_tensor->Memory()->SetMemoryReleaseCallback( - [this, memory_release_id]() { - this->MemoryManagerQueue()->Push(memory_release_id); - }); - } - } - } else { - infer_response = std::make_unique( - std::vector>{}, - std::make_shared( - "An error occurred while performing BLS request.")); - } - } - catch (const PythonBackendException& pb_exception) { - infer_response = std::make_unique( - std::vector>{}, - std::make_shared(pb_exception.what())); - } - - { - std::lock_guard lock(response_iterator_map_mu_); - if (response_iterator_map_.find(infer_response->Id()) != - response_iterator_map_.end()) { - response_iterator_map_[infer_response->Id()]->EnqueueResponse( - std::move(infer_response)); - } else { - auto response_iterator = - std::make_shared(std::move(infer_response)); - response_iterator_map_.insert( - std::pair>( - response_iterator->Id(), response_iterator)); - } - } - - { - bi::scoped_lock lock{ - *(ipc_message->ResponseMutex())}; - response_batch->waiting_on_stub = true; - ipc_message->ResponseCondition()->notify_all(); - while (response_batch->waiting_on_stub) { - ipc_message->ResponseCondition()->wait(lock); - } + std::unique_ptr ipc_message = + IPCMessage::LoadFromSharedMemory(shm_pool_, handle); + + switch (ipc_message->Command()) { + case PYTHONSTUB_CommandType::PYTHONSTUB_CUDAPoolInitializeRequest: { + GetCUDAMemoryPoolAddress(ipc_message); + } break; + case PYTHONSTUB_CommandType::PYTHONSTUB_InferStreamExecResponse: { + ProcessBLSResponseDecoupled(ipc_message); + } break; + default: + break; } } } @@ -1171,130 +1401,230 @@ Stub::IsFinalizing() return finalizing_; } -std::unique_ptr Logger::log_instance_; - -std::unique_ptr& -Logger::GetOrCreateInstance() -{ - if (Logger::log_instance_.get() == nullptr) { - 
Logger::log_instance_ = std::make_unique(); - } - - return Logger::log_instance_; -} - -// Bound function, called from the python client void -Logger::Log(const std::string& message, LogLevel level) +Stub::EnqueueUtilsMessage( + std::unique_ptr utils_msg_payload) { - std::unique_ptr& stub = Stub::GetOrCreateInstance(); - py::object frame = py::module_::import("inspect").attr("currentframe"); - py::object caller_frame = frame(); - py::object info = py::module_::import("inspect").attr("getframeinfo"); - py::object caller_info = info(caller_frame); - py::object filename_python = caller_info.attr("filename"); - std::string filename = filename_python.cast(); - py::object lineno = caller_info.attr("lineno"); - uint32_t line = lineno.cast(); - - if (!stub->StubToParentServiceActive()) { - Logger::GetOrCreateInstance()->Log(filename, line, level, message); - } else { - std::unique_ptr log_msg(new PbLog(filename, line, message, level)); - stub->EnqueueLogRequest(log_msg); + { + std::lock_guard guard{stub_to_parent_message_mu_}; + stub_to_parent_buffer_.push(std::move(utils_msg_payload)); } + stub_to_parent_message_cv_.notify_one(); } -// Called internally (.e.g. LOG_ERROR << "Error"; ) -void -Logger::Log( - const std::string& filename, uint32_t lineno, LogLevel level, - const std::string& message) +cudaStream_t +Stub::GetProxyStream(const int& device_id) { - // If the log monitor service is not active yet, format - // and pass messages to cerr - if (!BackendLoggingActive()) { - std::string path(filename); - size_t pos = path.rfind('/'); - if (pos != std::string::npos) { - path = path.substr(pos + 1, std::string::npos); +#ifdef TRITON_ENABLE_GPU + std::lock_guard lock(dlpack_proxy_stream_pool_mu_); + if (dlpack_proxy_stream_pool_.find(device_id) == + dlpack_proxy_stream_pool_.end()) { + cudaStream_t new_proxy_stream; + cudaError_t err = cudaStreamCreate(&new_proxy_stream); + if (err == cudaSuccess) { + dlpack_proxy_stream_pool_.emplace(device_id, new_proxy_stream); + return new_proxy_stream; + } else { + throw PythonBackendException( + "Failed to create a CUDA stream for a DLPack call."); } - std::stringstream ss; - struct timeval tv; - gettimeofday(&tv, NULL); - struct tm tm_time; - gmtime_r(((time_t*)&(tv.tv_sec)), &tm_time); - ss << LeadingLogChar(level) << std::setfill('0') << std::setw(2) - << (tm_time.tm_mon + 1) << std::setw(2) << tm_time.tm_mday << " " - << std::setw(2) << tm_time.tm_hour << ':' << std::setw(2) - << tm_time.tm_min << ':' << std::setw(2) << tm_time.tm_sec << "." 
- << std::setw(6) << tv.tv_usec << ' ' << static_cast(getpid()) - << ' ' << path << ':' << lineno << "] "; - std::cerr << ss.str() << " " << message << std::endl; - } else { - // Ensure we do not create a stub instance before it has initialized - std::unique_ptr& stub = Stub::GetOrCreateInstance(); - std::unique_ptr log_msg(new PbLog(filename, lineno, message, level)); - stub->EnqueueLogRequest(log_msg); } + return dlpack_proxy_stream_pool_[device_id]; +#else + return nullptr; +#endif } void -Logger::LogInfo(const std::string& message) +Stub::GetCUDAMemoryPoolAddress(std::unique_ptr& ipc_message) { - Logger::Log(message, LogLevel::INFO); -} +#ifdef TRITON_ENABLE_GPU + bool has_exception = false; + std::string error_string; + std::unique_ptr error_string_shm; -void -Logger::LogWarn(const std::string& message) -{ - Logger::Log(message, LogLevel::WARNING); -} + CUDAMemPoolMessage* cuda_pool_message_ptr = nullptr; + try { + AllocatedSharedMemory cuda_handle_shm = + shm_pool_->Load(ipc_message->Args()); + cuda_pool_message_ptr = cuda_handle_shm.data_.get(); + + CUDAHandler& cuda_api = CUDAHandler::getInstance(); + void* cuda_pool_address; + cuda_api.OpenCudaHandle( + cuda_pool_message_ptr->device_id, &cuda_pool_message_ptr->cuda_handle, + &cuda_pool_address); + shm_pool_->GetCUDAMemoryPoolManager()->SetCUDAPoolAddress( + cuda_pool_message_ptr->device_id, cuda_pool_address); + } + catch (const PythonBackendException& pb_exception) { + has_exception = true; + error_string = pb_exception.what(); + shm_pool_->GetCUDAMemoryPoolManager()->SetCUDAPoolAddress( + cuda_pool_message_ptr->device_id, nullptr); + } -void -Logger::LogError(const std::string& message) -{ - Logger::Log(message, LogLevel::ERROR); + if (has_exception) { + LOG_INFO << "Failed to initialize CUDA shared memory pool in Python stub: " + << error_string; + cuda_pool_message_ptr->has_error = true; + cuda_pool_message_ptr->is_error_set = false; + + LOG_IF_EXCEPTION( + error_string_shm = PbString::Create(shm_pool_, error_string)); + if (error_string_shm != nullptr) { + cuda_pool_message_ptr->is_error_set = true; + cuda_pool_message_ptr->error = error_string_shm->ShmHandle(); + } + } + + { + bi::scoped_lock lock{ + *(ipc_message->ResponseMutex())}; + cuda_pool_message_ptr->waiting_on_stub = true; + ipc_message->ResponseCondition()->notify_all(); + while (cuda_pool_message_ptr->waiting_on_stub) { + ipc_message->ResponseCondition()->wait(lock); + } + } +#endif } void -Logger::LogVerbose(const std::string& message) +Stub::ProcessBLSResponseDecoupled(std::unique_ptr& ipc_message) { - Logger::Log(message, LogLevel::VERBOSE); -} + ResponseBatch* response_batch = nullptr; + bi::managed_external_buffer::handle_t* response_handle = nullptr; + std::unique_ptr infer_response; + bool responses_is_set = false; + PythonBackendException pb_exception(std::string{}); -const std::string -Logger::LeadingLogChar(const LogLevel& level) -{ - switch (level) { - case LogLevel::WARNING: - return "W"; - case LogLevel::ERROR: - return "E"; - case LogLevel::INFO: - case LogLevel::VERBOSE: - default: - return "I"; + try { + AllocatedSharedMemory response_batch_shm = + shm_pool_->Load(ipc_message->Args()); + response_batch = + reinterpret_cast(response_batch_shm.data_.get()); + response_handle = reinterpret_cast( + response_batch_shm.data_.get() + sizeof(ResponseBatch)); + responses_is_set = true; + + if (response_batch->has_error) { + if (response_batch->is_error_set) { + std::unique_ptr pb_string = + PbString::LoadFromSharedMemory(shm_pool_, response_batch->error); + 
infer_response = std::make_unique( + std::vector>{}, + std::make_shared(pb_string->String())); + } else { + infer_response = std::make_unique( + std::vector>{}, + std::make_shared( + "An error occurred while performing BLS request.")); + } + } + + if (responses_is_set) { + infer_response = InferResponse::LoadFromSharedMemory( + shm_pool_, *response_handle, true /* open cuda handle */); + + for (auto& output_tensor : infer_response->OutputTensors()) { + if (!output_tensor->IsCPU()) { + uint64_t memory_release_id = + output_tensor->Memory()->MemoryReleaseId(); + output_tensor->Memory()->SetMemoryReleaseCallback( + [this, memory_release_id]() { + this->MemoryManagerQueue()->Push(memory_release_id); + }); + } + } + } else { + infer_response = std::make_unique( + std::vector>{}, + std::make_shared( + "An error occurred while performing BLS request.")); + } + } + catch (const PythonBackendException& pb_exception) { + infer_response = std::make_unique( + std::vector>{}, + std::make_shared(pb_exception.what())); } -} -void -Logger::SetBackendLoggingActive(bool status) -{ - backend_logging_active_ = status; -} + { + std::lock_guard lock(response_iterator_map_mu_); + if (response_iterator_map_.find(infer_response->Id()) != + response_iterator_map_.end()) { + response_iterator_map_[infer_response->Id()]->EnqueueResponse( + std::move(infer_response)); + } else { + auto response_iterator = + std::make_shared(std::move(infer_response)); + response_iterator_map_.insert( + std::pair>( + response_iterator->Id(), response_iterator)); + } + } -bool -Logger::BackendLoggingActive() -{ - return backend_logging_active_; + { + bi::scoped_lock lock{ + *(ipc_message->ResponseMutex())}; + response_batch->waiting_on_stub = true; + ipc_message->ResponseCondition()->notify_all(); + } } PYBIND11_EMBEDDED_MODULE(c_python_backend_utils, module) { - py::class_>(module, "TritonError") - .def(py::init()) - .def("message", &PbError::Message); + py::class_> triton_error( + module, "TritonError"); + py::enum_(triton_error, "__ErrorCode") + .value("UNKNOWN", TRITONSERVER_Error_Code::TRITONSERVER_ERROR_UNKNOWN) + .value("INTERNAL", TRITONSERVER_Error_Code::TRITONSERVER_ERROR_INTERNAL) + .value("NOT_FOUND", TRITONSERVER_Error_Code::TRITONSERVER_ERROR_NOT_FOUND) + .value( + "INVALID_ARG", + TRITONSERVER_Error_Code::TRITONSERVER_ERROR_INVALID_ARG) + .value( + "UNAVAILABLE", + TRITONSERVER_Error_Code::TRITONSERVER_ERROR_UNAVAILABLE) + .value( + "UNSUPPORTED", + TRITONSERVER_Error_Code::TRITONSERVER_ERROR_UNSUPPORTED) + .value( + "ALREADY_EXISTS", + TRITONSERVER_Error_Code::TRITONSERVER_ERROR_ALREADY_EXISTS) + .value("CANCELLED", TRITONSERVER_Error_Code::TRITONSERVER_ERROR_CANCELLED) + .export_values(); + triton_error.def_property_readonly_static( + "UNKNOWN", + [](py::object /* self */) { return TRITONSERVER_ERROR_UNKNOWN; }); + triton_error.def_property_readonly_static( + "INTERNAL", + [](py::object /* self */) { return TRITONSERVER_ERROR_INTERNAL; }); + triton_error.def_property_readonly_static( + "NOT_FOUND", + [](py::object /* self */) { return TRITONSERVER_ERROR_NOT_FOUND; }); + triton_error.def_property_readonly_static( + "INVALID_ARG", + [](py::object /* self */) { return TRITONSERVER_ERROR_INVALID_ARG; }); + triton_error.def_property_readonly_static( + "UNAVAILABLE", + [](py::object /* self */) { return TRITONSERVER_ERROR_UNAVAILABLE; }); + triton_error.def_property_readonly_static( + "UNSUPPORTED", + [](py::object /* self */) { return TRITONSERVER_ERROR_UNSUPPORTED; }); + triton_error.def_property_readonly_static( + 
"ALREADY_EXISTS", + [](py::object /* self */) { return TRITONSERVER_ERROR_ALREADY_EXISTS; }); + triton_error.def_property_readonly_static( + "CANCELLED", + [](py::object /* self */) { return TRITONSERVER_ERROR_CANCELLED; }); + triton_error.def( + py::init(), + py::arg("message").none(false), + py::arg("code").none(false) = TRITONSERVER_ERROR_INTERNAL); + triton_error.def("code", &PbError::Code); + triton_error.def("message", &PbError::Message); py::class_>( module, "PreferredMemory") @@ -1304,31 +1634,58 @@ PYBIND11_EMBEDDED_MODULE(c_python_backend_utils, module) py::arg("preferred_device_id").none(false) = 0); py::enum_(module, "MemoryType") - .value("TRITONSERVER_MEMORY_GPU", PreferredMemory::MemoryType::GPU) - .value("TRITONSERVER_MEMORY_CPU", PreferredMemory::MemoryType::CPU) + .value("TRITONSERVER_MEMORY_GPU", PreferredMemory::MemoryType::kGPU) + .value("TRITONSERVER_MEMORY_CPU", PreferredMemory::MemoryType::kCPU) .export_values(); + py::class_>( + module, "InferenceTrace") + .def("get_context", [](InferenceTrace& self) -> py::object { + auto context = self.Context(); + if (context != "") { + return py::str(context); + } + return py::none(); + }); + py::class_>( module, "InferenceRequest") .def( - py::init([](const std::string& request_id, uint64_t correlation_id, - const std::vector>& inputs, - const std::vector& requested_output_names, - const std::string& model_name, - const int64_t model_version, const uint32_t flags, - const int32_t timeout, - const PreferredMemory& preferred_memory) { - std::set requested_outputs; - for (auto& requested_output_name : requested_output_names) { - requested_outputs.emplace(requested_output_name); - } - // FIXME: InferenceRequest parameters are not supported in BLS now. - return std::make_shared( - request_id, correlation_id, inputs, requested_outputs, - model_name, model_version, "" /*parameters*/, flags, timeout, - 0 /*response_factory_address*/, 0 /*request_address*/, - preferred_memory); - }), + py::init( + [](const std::string& request_id, + const py::object& correlation_id, + const std::vector>& inputs, + const std::vector& requested_output_names, + const std::string& model_name, const int64_t model_version, + const uint32_t flags, const uint64_t timeout, + const PreferredMemory& preferred_memory, + const InferenceTrace& trace, const py::object& parameters_) { + py::dict parameters = + PyDefaultArgumentToMutableType(parameters_); + std::set requested_outputs; + for (auto& requested_output_name : requested_output_names) { + requested_outputs.emplace(requested_output_name); + } + std::string parameters_str = PyParametersToJSON(parameters); + + CorrelationId correlation_id_obj; + if (py::isinstance(correlation_id)) { + correlation_id_obj = + CorrelationId(py::cast(correlation_id)); + } else if (py::isinstance(correlation_id)) { + correlation_id_obj = + CorrelationId(py::cast(correlation_id)); + } else { + throw PythonBackendException( + "Correlation ID must be integer or string"); + } + + return std::make_shared( + request_id, correlation_id_obj, inputs, requested_outputs, + model_name, model_version, parameters_str, flags, timeout, + 0 /*response_factory_address*/, 0 /*request_address*/, + preferred_memory, trace); + }), py::arg("request_id").none(false) = "", py::arg("correlation_id").none(false) = 0, py::arg("inputs").none(false), @@ -1337,16 +1694,28 @@ PYBIND11_EMBEDDED_MODULE(c_python_backend_utils, module) py::arg("model_version").none(false) = -1, py::arg("flags").none(false) = 0, py::arg("timeout").none(false) = 0, 
py::arg("preferred_memory").none(false) = - PreferredMemory(PreferredMemory::DEFAULT, 0)) + PreferredMemory(PreferredMemory::kDefault, 0), + py::arg("trace").none(false) = InferenceTrace(), + py::arg("parameters").none(true) = py::none()) .def( "inputs", &InferRequest::Inputs, py::return_value_policy::reference_internal) .def("request_id", &InferRequest::RequestId) - .def("correlation_id", &InferRequest::CorrelationId) + .def( + "correlation_id", + [](InferRequest& self) -> py::object { + CorrelationId correlation_id = self.GetCorrelationId(); + if (correlation_id.Type() == CorrelationIdDataType::STRING) { + return py::cast(correlation_id.StringValue()); + } else { + return py::cast(correlation_id.UnsignedIntValue()); + } + }) .def("flags", &InferRequest::Flags) .def("set_flags", &InferRequest::SetFlags) .def("timeout", &InferRequest::Timeout) .def("parameters", &InferRequest::Parameters) + .def("trace", &InferRequest::GetTrace) .def( "exec", [](std::shared_ptr& infer_request, @@ -1370,11 +1739,6 @@ PYBIND11_EMBEDDED_MODULE(c_python_backend_utils, module) [](std::shared_ptr& infer_request, const bool decoupled) { std::unique_ptr& stub = Stub::GetOrCreateInstance(); - if (stub->IsDecoupled()) { - throw PythonBackendException( - "Async BLS request execution is not support in the decoupled " - "API."); - } py::object loop = py::module_::import("asyncio").attr("get_running_loop")(); py::cpp_function callback = [&stub, infer_request, decoupled]() { @@ -1398,7 +1762,10 @@ PYBIND11_EMBEDDED_MODULE(c_python_backend_utils, module) .def( "requested_output_names", &InferRequest::RequestedOutputNames, py::return_value_policy::reference_internal) - .def("get_response_sender", &InferRequest::GetResponseSender); + .def("get_response_sender", &InferRequest::GetResponseSender) + .def("is_cancelled", &InferRequest::IsCancelled) + .def("set_release_flags", &InferRequest::SetReleaseFlags), + py::arg("flags").none(false); py::class_>(module, "Tensor") .def(py::init(&PbTensor::FromNumpy)) @@ -1414,49 +1781,110 @@ PYBIND11_EMBEDDED_MODULE(c_python_backend_utils, module) .def("to_dlpack", &PbTensor::ToDLPack) .def("is_cpu", &PbTensor::IsCPU) .def("shape", &PbTensor::Dims) - .def("from_dlpack", &PbTensor::FromDLPack); + .def("from_dlpack", &PbTensor::FromDLPack) + .def("__dlpack__", &PbTensor::DLPack, py::arg("stream") = py::none()) + .def("__dlpack_device__", &PbTensor::DLPackDevice); py::class_>( module, "InferenceResponse") .def( - py::init< - const std::vector>&, - std::shared_ptr>(), - py::arg("output_tensors").none(false), - py::arg("error") = static_cast>(nullptr)) + py::init( + [](const std::vector>& output_tensors, + const std::shared_ptr& error, + const py::object& parameters_) { + py::dict parameters = + PyDefaultArgumentToMutableType(parameters_); + std::string parameters_str = PyParametersToJSON(parameters); + return std::make_shared( + output_tensors, error, parameters_str /* parameters */); + }), + py::arg("output_tensors") = py::list(), + py::arg("error") = static_cast>(nullptr), + py::arg("parameters") = py::none()) .def( "output_tensors", &InferResponse::OutputTensors, py::return_value_policy::reference) .def("has_error", &InferResponse::HasError) - .def("error", &InferResponse::Error); + .def("error", &InferResponse::Error) + .def("parameters", &InferResponse::Parameters); py::class_>( module, "InferenceResponseSender") .def( "send", &ResponseSender::Send, py::arg("response") = nullptr, - py::arg("flags") = 0); + py::arg("flags") = 0) + .def("is_cancelled", &ResponseSender::IsCancelled); 
py::class_>( module, "ResponseIterator") .def(py::init&>()) - .def("__iter__", &ResponseIterator::Iter, py::keep_alive<0, 1>()) - .def("__next__", &ResponseIterator::Next); + .def( + "__iter__", + [](ResponseIterator& it) -> ResponseIterator& { + it.Iter(); + return it; + }) + .def("__next__", &ResponseIterator::Next) + .def("cancel", &ResponseIterator::Cancel); py::class_ logger(module, "Logger"); py::enum_(logger, "LogLevel") - .value("INFO", LogLevel::INFO) - .value("WARNING", LogLevel::WARNING) - .value("ERROR", LogLevel::ERROR) - .value("VERBOSE", LogLevel::VERBOSE) + .value("INFO", LogLevel::kInfo) + .value("WARNING", LogLevel::kWarning) + .value("ERROR", LogLevel::kError) + .value("VERBOSE", LogLevel::kVerbose) .export_values(); logger.def_static( "log", py::overload_cast(&Logger::Log), - py::arg("message"), py::arg("level") = LogLevel::INFO); + py::arg("message"), py::arg("level") = LogLevel::kInfo); logger.def_static("log_info", &Logger::LogInfo, py::arg("message")); logger.def_static("log_warn", &Logger::LogWarn, py::arg("message")); logger.def_static("log_error", &Logger::LogError, py::arg("message")); logger.def_static("log_verbose", &Logger::LogVerbose, py::arg("message")); + py::class_>(module, "Metric") + .def("increment", &Metric::SendIncrementRequest) + .def("set", &Metric::SendSetValueRequest) + .def("observe", &Metric::SendObserveRequest) + .def("value", &Metric::SendGetValueRequest); + + py::enum_(module, "MetricKind") + .value("COUNTER", MetricKind::kCounter) + .value("GAUGE", MetricKind::kGauge) + .value("HISTOGRAM", MetricKind::kHistogram) + .export_values(); + + py::class_>( + module, "MetricFamily") + .def( + py::init(&MetricFamily::CreateMetricFamily), + py::arg("name").none(false), py::arg("description").none(false), + py::arg("kind").none(false)) + .def( + "Metric", &MetricFamily::CreateMetric, + py::arg("labels").none(true) = py::none(), + py::arg("buckets").none(true) = py::none()); + module.attr("MetricFamily").attr("COUNTER") = MetricKind::kCounter; + module.attr("MetricFamily").attr("GAUGE") = MetricKind::kGauge; + module.attr("MetricFamily").attr("HISTOGRAM") = MetricKind::kHistogram; + + module.def( + "load_model", &LoadModel, py::arg("model_name").none(false), + py::arg("config").none(false) = "", + py::arg("files").none(true) = py::none()); + module.def( + "unload_model", &UnloadModel, py::arg("model_name").none(false), + py::arg("unload_dependents").none(false) = false); + module.def( + "is_model_ready", &IsModelReady, py::arg("model_name").none(false), + py::arg("model_version").none(false) = ""); + + // This function is not part of the public API for Python backend. This is + // only used for internal callbacks. + module.def( + "async_event_future_done_callback", &AsyncEventFutureDoneCallback, + py::arg("py_future").none(false)); + // This class is not part of the public API for Python backend. This is only // used for internal testing purposes. 
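A similarly hedged sketch of the Logger and metrics bindings registered above; the metric family name, labels, and log messages are invented for illustration and the snippet again assumes a model.py context.

import triton_python_backend_utils as pb_utils

pb_utils.Logger.log_info("model initialized")
pb_utils.Logger.log("verbose detail", pb_utils.Logger.VERBOSE)

# Counter metric; GAUGE and HISTOGRAM families are created the same way.
family = pb_utils.MetricFamily(
    name="example_inference_requests_total",          # hypothetical name
    description="Number of requests seen by this model",
    kind=pb_utils.MetricFamily.COUNTER,
)
requests_seen = family.Metric(labels={"model": "example_model"})
requests_seen.increment(1)
current_count = requests_seen.value()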
py::class_(module, "SharedMemory") @@ -1466,6 +1894,90 @@ PYBIND11_EMBEDDED_MODULE(c_python_backend_utils, module) module, "TritonModelException"); } + +void +ModelContext::Init( + const std::string& model_path, const std::string& runtime_modeldir, + const std::string& triton_install_path, const std::string& model_version) +{ + const char os_slash = std::filesystem::path::preferred_separator; + type_ = ModelType::kDefault; + if (runtime_modeldir != "DEFAULT") { + // For python based backends, existence of `model.py` in the corresponding + // backend folder happens on the core side, so we can omit this check here. + python_model_path_ = runtime_modeldir + os_slash + "model.py"; + type_ = ModelType::kBackend; + } else { + python_model_path_ = model_path; + // Check if model file exists in this path. + struct stat buffer; + if (stat(python_model_path_.c_str(), &buffer) != 0) { + throw PythonBackendException( + ("Python model file not found in \'" + model_path + "\'")); + } + } + + model_dir_ = model_path.substr(0, model_path.find_last_of(os_slash)); + python_backend_folder_ = triton_install_path; + model_version_ = model_version; + runtime_modeldir_ = runtime_modeldir; +} + +void +ModelContext::StubSetup(py::module& sys) +{ + const char os_slash = std::filesystem::path::preferred_separator; + std::string model_name = + python_model_path_.substr(python_model_path_.find_last_of(os_slash) + 1); + + // Model name without the .py extension + auto dotpy_pos = model_name.find_last_of(".py"); + if (dotpy_pos == std::string::npos || dotpy_pos != model_name.size() - 1) { + throw PythonBackendException( + "Model name must end with '.py'. Model name is \"" + model_name + + "\"."); + } + // The position of last character of the string that is searched for is + // returned by 'find_last_of'. Need to manually adjust the position. + std::string model_name_trimmed = model_name.substr(0, dotpy_pos - 2); + + if (type_ == ModelType::kDefault) { + std::string model_path_parent = + python_model_path_.substr(0, python_model_path_.find_last_of(os_slash)); + std::string model_path_parent_parent = + model_path_parent.substr(0, model_path_parent.find_last_of(os_slash)); + sys.attr("path").attr("append")(model_path_parent); + sys.attr("path").attr("append")(model_path_parent_parent); + sys.attr("path").attr("append")(python_backend_folder_); + sys = py::module_::import( + (std::string(model_version_) + "." + model_name_trimmed).c_str()); + } else { + std::string model_path_parent = + python_model_path_.substr(0, python_model_path_.find_last_of(os_slash)); + std::string backend_model_dir(model_path_parent); + sys.attr("path").attr("append")(backend_model_dir); + sys.attr("path").attr("append")(python_backend_folder_); + sys = py::module_::import(model_name_trimmed.c_str()); + } +} + +#ifdef _WIN32 +bool +ParentProcessActive(DWORD parent_id) +{ + HANDLE parent = OpenProcess(PROCESS_ALL_ACCESS, FALSE, parent_id); + DWORD exit_code; + GetExitCodeProcess(parent, &exit_code); + return (exit_code == STILL_ACTIVE); +} +#else +bool +ParentProcessActive(pid_t parent_id) +{ + return (kill(parent_id, 0) == 0); +} +#endif + extern "C" { int @@ -1480,17 +1992,18 @@ main(int argc, char** argv) signal(SIGINT, SignalHandler); signal(SIGTERM, SignalHandler); - // Path to model.py + // Path to model std::string model_path = argv[1]; std::string shm_region_name = argv[2]; - int64_t shm_default_size = std::stoi(argv[3]); + int64_t shm_default_size = std::stol(argv[3]); std::vector model_path_tokens; // Find the package name from model path. 
size_t prev = 0, pos = 0; + const char os_slash = std::filesystem::path::preferred_separator; do { - pos = model_path.find("/", prev); + pos = model_path.find(os_slash, prev); if (pos == std::string::npos) pos = model_path.length(); std::string token = model_path.substr(prev, pos - prev); @@ -1505,30 +2018,41 @@ main(int argc, char** argv) exit(1); } std::string model_version = model_path_tokens[model_path_tokens.size() - 2]; - int64_t shm_growth_size = std::stoi(argv[4]); + int64_t shm_growth_size = std::stol(argv[4]); std::string triton_install_path = argv[6]; std::string name = argv[8]; + std::string runtime_modeldir = argv[9]; std::unique_ptr& stub = Stub::GetOrCreateInstance(); try { stub->Instantiate( shm_growth_size, shm_default_size, shm_region_name, model_path, model_version, argv[6] /* triton install path */, - std::stoi(argv[7]) /* IPCControl handle */, name); + std::stoi(argv[7]) /* IPCControl handle */, name, runtime_modeldir); } catch (const PythonBackendException& pb_exception) { LOG_INFO << "Failed to preinitialize Python stub: " << pb_exception.what(); logger.reset(); + stub.reset(); exit(1); } // Start the Python Interpreter py::scoped_interpreter guard{}; +#ifdef _WIN32 + DWORD parent_pid = (DWORD)std::stoul(argv[5]); +#else pid_t parent_pid = std::stoi(argv[5]); - +#endif std::atomic background_thread_running = {true}; std::thread background_thread = std::thread([&parent_pid, &background_thread_running, &stub, &logger] { + // Send a dummy message after the stub process is launched to notify the + // parent process that the health thread has started. + std::unique_ptr ipc_message = IPCMessage::Create( + stub->SharedMemory(), false /* inline_response */); + stub->SendIPCMessage(ipc_message); + while (background_thread_running) { // Every 300ms set the health variable to true. This variable is in // shared memory and will be set to false by the parent process. @@ -1538,7 +2062,7 @@ main(int argc, char** argv) stub->UpdateHealth(); - if (kill(parent_pid, 0) != 0) { + if (!ParentProcessActive(parent_pid)) { // When unhealthy, we should stop attempting to send // messages to the backend ASAP. if (stub->StubToParentServiceActive()) { diff --git a/src/pb_stub.h b/src/pb_stub.h index 24d94eb6..942ecd98 100644 --- a/src/pb_stub.h +++ b/src/pb_stub.h @@ -1,4 +1,4 @@ -// Copyright 2021-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// Copyright 2021-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
// // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions @@ -29,129 +29,70 @@ #include #include #include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include + #include "infer_request.h" #include "infer_response.h" #include "ipc_message.h" #include "message_queue.h" +#include "metric.h" +#include "metric_family.h" +#include "pb_cancel.h" #include "pb_log.h" #include "pb_response_iterator.h" -#include "pb_utils.h" namespace bi = boost::interprocess; namespace py = pybind11; using namespace pybind11::literals; +#ifndef TRITON_ENABLE_GPU +using cudaStream_t = void*; +#endif + namespace triton { namespace backend { namespace python { -#define LOG_IF_EXCEPTION(X) \ - do { \ - try { \ - (X); \ - } \ - catch (const PythonBackendException& pb_exception) { \ - LOG_INFO << pb_exception.what(); \ - } \ - } while (false) - -#define LOG_EXCEPTION(E) \ - do { \ - LOG_INFO << E.what(); \ - } while (false) - -/// Macros that use current filename and line number. -#define LOG_INFO LOG_FL(__FILE__, __LINE__, LogLevel::INFO) -#define LOG_WARN LOG_FL(__FILE__, __LINE__, LogLevel::WARNING) -#define LOG_ERROR LOG_FL(__FILE__, __LINE__, LogLevel::ERROR) -#define LOG_VERBOSE LOG_FL(__FILE__, __LINE__, LogLevel::VERBOSE) - -class Logger { +class ModelContext { public: - Logger() { backend_logging_active_ = false; }; - ~Logger() { log_instance_.reset(); }; - /// Python client log function - static void Log(const std::string& message, LogLevel level = LogLevel::INFO); - - /// Python client log info function - static void LogInfo(const std::string& message); - - /// Python client warning function - static void LogWarn(const std::string& message); - - /// Python client log error function - static void LogError(const std::string& message); - - /// Python client log verbose function - static void LogVerbose(const std::string& message); - - /// Internal log function - void Log( - const std::string& filename, uint32_t lineno, LogLevel level, - const std::string& message); - - /// Log format helper function - const std::string LeadingLogChar(const LogLevel& level); - - /// Set PYBE Logging Status - void SetBackendLoggingActive(bool status); - - /// Get PYBE Logging Status - bool BackendLoggingActive(); - - /// Singleton Getter Function - static std::unique_ptr& GetOrCreateInstance(); - - DISALLOW_COPY_AND_ASSIGN(Logger); + // Scans and establishes path for serving the python model. + void Init( + const std::string& model_path, const std::string& platform, + const std::string& triton_install_path, const std::string& model_version); + // Sets up the python stub with appropriate paths. + void StubSetup(py::module& sys); - /// Flush the log. - void Flush() { std::cerr << std::flush; } + std::string& PythonModelPath() { return python_model_path_; } + std::string& ModelDir() { return model_dir_; } private: - static std::unique_ptr log_instance_; - bool backend_logging_active_; + std::string python_model_path_; + std::string model_dir_; + std::string model_version_; + std::string python_backend_folder_; + std::string runtime_modeldir_; + + // Triton supports python-based backends, + // i.e. backends that provide common `model.py`, that can be re-used + // between different models. 
`ModelType` helps to differentiate + // between models running with c++ python backend (ModelType::kDefault) + // and models running with python-based backend (ModelType::kBackend) + // at the time of ModelContext::StubSetup to properly set up paths. + enum ModelType { kDefault, kBackend }; + ModelType type_; }; -class LogMessage { - public: - /// Create a log message, stripping the path down to the filename only - LogMessage(const char* file, int line, LogLevel level) : level_(level) - { - std::string path(file); - size_t pos = path.rfind('/'); - if (pos != std::string::npos) { - path = path.substr(pos + 1, std::string::npos); - } - file_ = path; - line_ = static_cast(line); - } - /// Log message to console or send to backend (see Logger::Log for details) - ~LogMessage() +// The payload for the stub_to_parent message queue. This struct serves as a +// wrapper for different types of messages so that they can be sent through the +// same buffer. +struct UtilsMessagePayload { + UtilsMessagePayload( + const PYTHONSTUB_CommandType& command_type, void* utils_message_ptr) + : command_type(command_type), utils_message_ptr(utils_message_ptr) { - Logger::GetOrCreateInstance()->Log(file_, line_, level_, stream_.str()); } - - std::stringstream& stream() { return stream_; } - - private: - std::stringstream stream_; - std::string file_; - uint32_t line_; - LogLevel level_; + PYTHONSTUB_CommandType command_type; + void* utils_message_ptr; }; -#define LOG_FL(FN, LN, LVL) LogMessage((char*)(FN), LN, LVL).stream() - class Stub { public: Stub() : stub_to_parent_thread_(false), parent_to_stub_thread_(false){}; @@ -163,7 +104,8 @@ class Stub { const std::string& shm_region_name, const std::string& model_path, const std::string& model_version, const std::string& triton_install_path, bi::managed_external_buffer::handle_t ipc_control_handle, - const std::string& model_instance_name); + const std::string& model_instance_name, + const std::string& runtime_modeldir); /// Get the health of the stub process. bool& Health(); @@ -177,6 +119,9 @@ class Stub { /// Setup for the stub process py::module StubSetup(); + /// Return the path to the model + py::str GetModelDir() { return model_context_.ModelDir(); } + /// Set the model configuration for auto-complete void AutoCompleteModelConfig( bi::managed_external_buffer::handle_t string_handle, @@ -206,13 +151,28 @@ class Stub { /// Execute a batch of requests. 
void ProcessRequests(RequestBatch* request_batch_shm_ptr); - void ProcessRequestsDecoupled(RequestBatch* request_batch_shm_ptr); + void ProcessReturnedResponses( + py::list py_requests, py::object py_responses_obj, + std::optional>& response_batch); + + void ProcessResponse(InferResponse* response); + + py::object GetAsyncEventLoop(); + + py::object RunCoroutine(py::object coroutine, bool in_background); + + void BackgroundFutureDone(const py::object& py_future); /// Get the memory manager message queue std::unique_ptr>& MemoryManagerQueue(); - void ProcessResponse(InferResponse* response); + /// Get the shared memory pool + std::unique_ptr& ShmPool() { return shm_pool_; } + + void ProcessBLSResponseDecoupled(std::unique_ptr& ipc_message); + void LoadGPUBuffers(std::unique_ptr& ipc_message); + bool IsDecoupled(); ~Stub(); @@ -229,7 +189,7 @@ class Stub { void ServiceStubToParentRequests(); /// Send client log to the python backend - void SendLogMessage(std::unique_ptr& log_send_message); + void SendLogMessage(std::unique_ptr& utils_msg_payload); /// Check if stub to parent message handler is running bool StubToParentServiceActive(); @@ -251,10 +211,28 @@ class Stub { std::shared_ptr infer_response); /// Send the id to the python backend for object cleanup - void SendCleanupId(void* id); + void SendCleanupId( + std::unique_ptr& utils_msg_payload, + const PYTHONSTUB_CommandType& command_type); + + /// Add cleanup id to queue. This is used for cleaning up the infer_payload + /// and the response factory for BLS decoupled response. + void EnqueueCleanupId(void* id, const PYTHONSTUB_CommandType& command_type); + + /// Send the id to the python backend for request address retrieval and + /// cancellation + void SendCancelBLSRequest( + std::unique_ptr& utils_msg_payload); + + /// Add infer payload id to queue. This is used for retrieving the request + /// address from the infer_payload + void EnqueueCancelBLSRequest(PbBLSCancel* pb_bls_cancel); + + /// Add request cancellation query to queue + void EnqueueIsCancelled(PbCancel* pb_cancel); - /// Add cleanup id to queue - void EnqueueCleanupId(void* id); + /// Send request cancellation query to python backend + void SendIsCancelled(std::unique_ptr& utils_msg_payload); /// Is the stub initialized bool IsInitialized(); @@ -262,6 +240,30 @@ class Stub { /// Is the stub in the finalize stage bool IsFinalizing(); + /// Helper function to enqueue a utils message to the stub to parent message + /// buffer + void EnqueueUtilsMessage( + std::unique_ptr utils_msg_payload); + + /// Send the message to the python backend. MessageType should be either + // 'MetricFamilyMessage', 'MetricMessage' or 'ModelLoaderMessage'. + template + void SendMessage( + AllocatedSharedMemory& msg_shm, + PYTHONSTUB_CommandType command_type, + bi::managed_external_buffer::handle_t handle); + + /// Helper function to prepare the message. MessageType should be either + // 'MetricFamilyMessage', 'MetricMessage' or 'ModelLoaderMessage'. + template + void PrepareMessage(AllocatedSharedMemory& msg_shm); + + /// Helper function to retrieve a proxy stream for dlpack synchronization + /// for provided device + cudaStream_t GetProxyStream(const int& device_id); + + /// Get the CUDA memory pool address from the parent process. 
+ void GetCUDAMemoryPoolAddress(std::unique_ptr& ipc_message); private: bi::interprocess_mutex* stub_mutex_; @@ -269,15 +271,15 @@ class Stub { bi::interprocess_mutex* parent_mutex_; bi::interprocess_condition* parent_cond_; bi::interprocess_mutex* health_mutex_; - std::string model_path_; - std::string model_version_; + ModelContext model_context_; std::string name_; - std::string triton_install_path_; IPCControlShm* ipc_control_; std::unique_ptr shm_pool_; py::object model_instance_; py::object deserialize_bytes_; py::object serialize_bytes_; + py::object async_event_loop_; + py::object background_futures_; std::unique_ptr> stub_message_queue_; std::unique_ptr> @@ -291,8 +293,7 @@ class Stub { bool finalizing_; static std::unique_ptr stub_instance_; std::vector> gpu_tensors_; - std::queue> log_request_buffer_; - std::queue bls_response_cleanup_buffer_; + std::queue> stub_to_parent_buffer_; std::thread stub_to_parent_queue_monitor_; bool stub_to_parent_thread_; std::mutex stub_to_parent_message_mu_; @@ -302,5 +303,72 @@ class Stub { std::mutex response_iterator_map_mu_; std::unordered_map> response_iterator_map_; + std::mutex dlpack_proxy_stream_pool_mu_; + std::unordered_map dlpack_proxy_stream_pool_; }; + +template +void +Stub::PrepareMessage(AllocatedSharedMemory& msg_shm) +{ + msg_shm = shm_pool_->Construct(); + MessageType* msg = msg_shm.data_.get(); + new (&(msg->mu)) bi::interprocess_mutex; + new (&(msg->cv)) bi::interprocess_condition; + msg->waiting_on_stub = false; + msg->is_error_set = false; + msg->has_error = false; +} + +template +void +Stub::SendMessage( + AllocatedSharedMemory& msg_shm, + PYTHONSTUB_CommandType command_type, + bi::managed_external_buffer::handle_t handle) +{ + PrepareMessage(msg_shm); + MessageType* msg = msg_shm.data_.get(); + msg->message = handle; + + std::unique_ptr ipc_message = + IPCMessage::Create(shm_pool_, false /* inline_response */); + ipc_message->Command() = command_type; + ipc_message->Args() = msg_shm.handle_; + + std::unique_lock guard{stub_to_parent_message_mu_}; + { + ScopedDefer _([&ipc_message, msg] { + { + bi::scoped_lock guard{msg->mu}; + msg->waiting_on_stub = false; + msg->cv.notify_all(); + } + }); + + { + bi::scoped_lock guard{msg->mu}; + SendIPCUtilsMessage(ipc_message); + while (!msg->waiting_on_stub) { + msg->cv.wait(guard); + } + } + } + if (msg->has_error) { + if (msg->is_error_set) { + std::unique_ptr pb_string = + PbString::LoadFromSharedMemory(shm_pool_, msg->error); + std::string err_message = + std::string( + "Failed to process the request for model '" + name_ + + "', message: ") + + pb_string->String(); + throw PythonBackendException(err_message); + } else { + std::string err_message = std::string( + "Failed to process the request for model '" + name_ + "'."); + throw PythonBackendException(err_message); + } + } +} }}} // namespace triton::backend::python diff --git a/src/pb_stub_log.cc b/src/pb_stub_log.cc new file mode 100644 index 00000000..d0b1ff97 --- /dev/null +++ b/src/pb_stub_log.cc @@ -0,0 +1,170 @@ +// Copyright 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions +// are met: +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. 
+// * Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// * Neither the name of NVIDIA CORPORATION nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +#include "pb_stub_log.h" + +#include + +#include "pb_stub.h" + + +namespace py = pybind11; + +namespace triton { namespace backend { namespace python { + +std::unique_ptr Logger::log_instance_; + +std::unique_ptr& +Logger::GetOrCreateInstance() +{ + if (Logger::log_instance_.get() == nullptr) { + Logger::log_instance_ = std::make_unique(); + } + + return Logger::log_instance_; +} + +// Bound function, called from the python client +void +Logger::Log(const std::string& message, LogLevel level) +{ + std::unique_ptr& stub = Stub::GetOrCreateInstance(); + py::object frame = py::module_::import("inspect").attr("currentframe"); + py::object caller_frame = frame(); + py::object info = py::module_::import("inspect").attr("getframeinfo"); + py::object caller_info = info(caller_frame); + py::object filename_python = caller_info.attr("filename"); + std::string filename = filename_python.cast(); + py::object lineno = caller_info.attr("lineno"); + uint32_t line = lineno.cast(); + + if (!stub->StubToParentServiceActive()) { + Logger::GetOrCreateInstance()->Log(filename, line, level, message); + } else { + std::unique_ptr log_msg(new PbLog(filename, line, message, level)); + stub->EnqueueLogRequest(log_msg); + } +} + +// Called internally (.e.g. LOG_ERROR << "Error"; ) +void +Logger::Log( + const std::string& filename, uint32_t lineno, LogLevel level, + const std::string& message) +{ + // If the log monitor service is not active yet, format + // and pass messages to cerr + if (!BackendLoggingActive()) { + std::string path(filename); + size_t pos = path.rfind(std::filesystem::path::preferred_separator); + if (pos != std::string::npos) { + path = path.substr(pos + 1, std::string::npos); + } +#ifdef _WIN32 + std::stringstream ss; + SYSTEMTIME system_time; + GetSystemTime(&system_time); + ss << LeadingLogChar(level) << std::setfill('0') << std::setw(2) + << system_time.wMonth << std::setw(2) << system_time.wDay << ' ' + << std::setw(2) << system_time.wHour << ':' << std::setw(2) + << system_time.wMinute << ':' << std::setw(2) << system_time.wSecond + << '.' 
<< std::setw(6) << system_time.wMilliseconds * 1000 << ' ' + << static_cast(GetCurrentProcessId()) << ' ' << path << ':' + << lineno << "] "; +#else + std::stringstream ss; + struct timeval tv; + gettimeofday(&tv, NULL); + struct tm tm_time; + gmtime_r(((time_t*)&(tv.tv_sec)), &tm_time); + ss << LeadingLogChar(level) << std::setfill('0') << std::setw(2) + << (tm_time.tm_mon + 1) << std::setw(2) << tm_time.tm_mday << " " + << std::setw(2) << tm_time.tm_hour << ':' << std::setw(2) + << tm_time.tm_min << ':' << std::setw(2) << tm_time.tm_sec << "." + << std::setw(6) << tv.tv_usec << ' ' << static_cast(getpid()) + << ' ' << path << ':' << lineno << "] "; + std::cerr << ss.str() << " " << message << std::endl; +#endif + } else { + // Ensure we do not create a stub instance before it has initialized + std::unique_ptr& stub = Stub::GetOrCreateInstance(); + std::unique_ptr log_msg(new PbLog(filename, lineno, message, level)); + stub->EnqueueLogRequest(log_msg); + } +} + +void +Logger::LogInfo(const std::string& message) +{ + Logger::Log(message, LogLevel::kInfo); +} + +void +Logger::LogWarn(const std::string& message) +{ + Logger::Log(message, LogLevel::kWarning); +} + +void +Logger::LogError(const std::string& message) +{ + Logger::Log(message, LogLevel::kError); +} + +void +Logger::LogVerbose(const std::string& message) +{ + Logger::Log(message, LogLevel::kVerbose); +} + +const std::string +Logger::LeadingLogChar(const LogLevel& level) +{ + switch (level) { + case LogLevel::kWarning: + return "W"; + case LogLevel::kError: + return "E"; + case LogLevel::kInfo: + case LogLevel::kVerbose: + default: + return "I"; + } +} + +void +Logger::SetBackendLoggingActive(bool status) +{ + backend_logging_active_ = status; +} + +bool +Logger::BackendLoggingActive() +{ + return backend_logging_active_; +} + +}}} // namespace triton::backend::python diff --git a/src/pb_stub_log.h b/src/pb_stub_log.h new file mode 100644 index 00000000..df67eba8 --- /dev/null +++ b/src/pb_stub_log.h @@ -0,0 +1,134 @@ +// Copyright 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions +// are met: +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// * Neither the name of NVIDIA CORPORATION nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +#pragma once + +#include +#include + +#include "pb_utils.h" + +namespace triton { namespace backend { namespace python { + +#define LOG_IF_EXCEPTION(X) \ + do { \ + try { \ + (X); \ + } \ + catch (const PythonBackendException& pb_exception) { \ + LOG_INFO << pb_exception.what(); \ + } \ + } while (false) + +#define LOG_EXCEPTION(E) \ + do { \ + LOG_INFO << E.what(); \ + } while (false) + +/// Macros that use current filename and line number. +#define LOG_INFO LOG_FL(__FILE__, __LINE__, LogLevel::kInfo) +#define LOG_WARN LOG_FL(__FILE__, __LINE__, LogLevel::kWarning) +#define LOG_ERROR LOG_FL(__FILE__, __LINE__, LogLevel::kError) +#define LOG_VERBOSE LOG_FL(__FILE__, __LINE__, LogLevel::kVerbose) + +class Logger { + public: + Logger() { backend_logging_active_ = false; }; + ~Logger() { log_instance_.reset(); }; + /// Python client log function + static void Log(const std::string& message, LogLevel level = LogLevel::kInfo); + + /// Python client log info function + static void LogInfo(const std::string& message); + + /// Python client warning function + static void LogWarn(const std::string& message); + + /// Python client log error function + static void LogError(const std::string& message); + + /// Python client log verbose function + static void LogVerbose(const std::string& message); + + /// Internal log function + void Log( + const std::string& filename, uint32_t lineno, LogLevel level, + const std::string& message); + + /// Log format helper function + const std::string LeadingLogChar(const LogLevel& level); + + /// Set PYBE Logging Status + void SetBackendLoggingActive(bool status); + + /// Get PYBE Logging Status + bool BackendLoggingActive(); + + /// Singleton Getter Function + static std::unique_ptr& GetOrCreateInstance(); + + DISALLOW_COPY_AND_ASSIGN(Logger); + + /// Flush the log. 
+ void Flush() { std::cerr << std::flush; } + + private: + static std::unique_ptr log_instance_; + bool backend_logging_active_; +}; + +class LogMessage { + public: + /// Create a log message, stripping the path down to the filename only + LogMessage(const char* file, int line, LogLevel level) : level_(level) + { + std::string path(file); + const char os_slash = std::filesystem::path::preferred_separator; + size_t pos = path.rfind(os_slash); + if (pos != std::string::npos) { + path = path.substr(pos + 1, std::string::npos); + } + file_ = path; + line_ = static_cast(line); + } + /// Log message to console or send to backend (see Logger::Log for details) + ~LogMessage() + { + Logger::GetOrCreateInstance()->Log(file_, line_, level_, stream_.str()); + } + + std::stringstream& stream() { return stream_; } + + private: + std::stringstream stream_; + std::string file_; + uint32_t line_; + LogLevel level_; +}; + +#define LOG_FL(FN, LN, LVL) LogMessage((char*)(FN), LN, LVL).stream() + +}}} // namespace triton::backend::python diff --git a/src/pb_stub_utils.cc b/src/pb_stub_utils.cc index 315f74a4..9e05feae 100644 --- a/src/pb_stub_utils.cc +++ b/src/pb_stub_utils.cc @@ -1,4 +1,4 @@ -// Copyright 2021-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// Copyright 2021-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions @@ -25,6 +25,7 @@ // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "pb_stub_utils.h" + #include "pb_utils.h" namespace triton { namespace backend { namespace python { @@ -167,6 +168,8 @@ triton_to_pybind_dtype(TRITONSERVER_DataType data_type) dtype_numpy = py::dtype(py::format_descriptor::format()); break; case TRITONSERVER_TYPE_BF16: + // NOTE: Currently skipping this call via `if (BF16)` check, but may + // want to better handle this or set some default/invalid dtype. 
throw PythonBackendException("TYPE_BF16 not currently supported."); case TRITONSERVER_TYPE_INVALID: throw PythonBackendException("Dtype is invalid."); @@ -189,8 +192,8 @@ triton_to_dlpack_type(TRITONSERVER_DataType triton_dtype) dl_dtype.lanes = 1; switch (triton_dtype) { case TRITONSERVER_TYPE_BOOL: - dl_code = DLDataTypeCode::kDLInt; - dt_size = 1; + dl_code = DLDataTypeCode::kDLBool; + dt_size = 8; break; case TRITONSERVER_TYPE_UINT8: dl_code = DLDataTypeCode::kDLUInt; @@ -239,6 +242,10 @@ triton_to_dlpack_type(TRITONSERVER_DataType triton_dtype) case TRITONSERVER_TYPE_BYTES: throw PythonBackendException( "TYPE_BYTES tensors cannot be converted to DLPack."); + case TRITONSERVER_TYPE_BF16: + dl_code = DLDataTypeCode::kDLBfloat; + dt_size = 16; + break; default: throw PythonBackendException( @@ -279,8 +286,6 @@ dlpack_to_triton_type(const DLDataType& data_type) return TRITONSERVER_TYPE_INT32; } else if (data_type.bits == 64) { return TRITONSERVER_TYPE_INT64; - } else if (data_type.bits == 1) { - return TRITONSERVER_TYPE_BOOL; } } @@ -296,6 +301,21 @@ dlpack_to_triton_type(const DLDataType& data_type) } } + if (data_type.code == DLDataTypeCode::kDLBool) { + if (data_type.bits == 8) { + return TRITONSERVER_TYPE_BOOL; + } + } + + if (data_type.code == DLDataTypeCode::kDLBfloat) { + if (data_type.bits != 16) { + throw PythonBackendException( + "Expected BF16 tensor to have 16 bits, but had: " + + std::to_string(data_type.bits)); + } + return TRITONSERVER_TYPE_BF16; + } + return TRITONSERVER_TYPE_INVALID; } }}} // namespace triton::backend::python diff --git a/src/pb_stub_utils.h b/src/pb_stub_utils.h index 598bf436..6068fba9 100644 --- a/src/pb_stub_utils.h +++ b/src/pb_stub_utils.h @@ -28,6 +28,7 @@ #include #include #include + #include "triton/core/tritonserver.h" namespace py = pybind11; diff --git a/src/pb_tensor.cc b/src/pb_tensor.cc index c4b08b7f..26e77586 100644 --- a/src/pb_tensor.cc +++ b/src/pb_tensor.cc @@ -1,4 +1,4 @@ -// Copyright 2021-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// Copyright 2021-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
// // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions @@ -29,15 +29,82 @@ #endif // TRITON_ENABLE_GPU #ifdef TRITON_PB_STUB +#include "pb_stub.h" #include "pb_stub_utils.h" namespace py = pybind11; #endif #include "pb_tensor.h" +// WAR for undefined ssize_t on Windows: https://stackoverflow.com/a/35368387 +#if defined(_MSC_VER) +#include +typedef SSIZE_T ssize_t; +#endif + +#include +#include +#include namespace triton { namespace backend { namespace python { #ifdef TRITON_PB_STUB +py::array +deserialize_bytes_tensor_cpp(const uint8_t* data, size_t data_size) +{ + if (data_size == 0) { + py::module numpy = py::module::import("numpy"); + return numpy.attr("empty")(0, py::dtype("object")); + } + + // First pass: count the number of strings and calculate total size + size_t offset = 0; + size_t num_strings = 0; + size_t total_string_size = 0; + + while (offset < data_size) { + if (offset + 4 > data_size) { + throw PythonBackendException( + "Invalid bytes tensor data: incomplete length field"); + } + + // Read 4-byte length (little-endian) + uint32_t length = *reinterpret_cast(data + offset); + offset += 4; + + if (offset + length > data_size) { + throw PythonBackendException( + "Invalid bytes tensor data: string extends beyond buffer"); + } + + num_strings++; + total_string_size += length; + offset += length; + } + + // Create numpy array of objects using pybind11's numpy module + py::module numpy = py::module::import("numpy"); + py::array result = numpy.attr("empty")(num_strings, py::dtype("object")); + auto result_ptr = static_cast(result.request().ptr); + + // Second pass: extract strings + offset = 0; + size_t string_index = 0; + + while (offset < data_size) { + uint32_t length = *reinterpret_cast(data + offset); + offset += 4; + + // Create Python bytes object using pybind11 + py::bytes bytes_obj(reinterpret_cast(data + offset), length); + Py_INCREF(bytes_obj.ptr()); // Increment reference count + result_ptr[string_index] = bytes_obj.ptr(); + string_index++; + offset += length; + } + + return result; +} + PbTensor::PbTensor(const std::string& name, py::array& numpy_array) : name_(name) { @@ -146,19 +213,17 @@ PbTensor::PbTensor( #ifdef TRITON_PB_STUB if (memory_type_ == TRITONSERVER_MEMORY_CPU || memory_type_ == TRITONSERVER_MEMORY_CPU_PINNED) { - if (dtype != TRITONSERVER_TYPE_BYTES) { + if (dtype == TRITONSERVER_TYPE_BF16) { + // No native numpy representation for BF16. DLPack should be used instead. 
+ numpy_array_ = py::none(); + } else if (dtype != TRITONSERVER_TYPE_BYTES) { py::object numpy_array = py::array(triton_to_pybind_dtype(dtype_), dims_, (void*)memory_ptr_); numpy_array_ = numpy_array.attr("view")(triton_to_numpy_type(dtype_)); } else { - py::object numpy_array = py::array( - triton_to_pybind_dtype(TRITONSERVER_TYPE_UINT8), {byte_size}, - (void*)memory_ptr_); - py::module triton_pb_utils = - py::module::import("triton_python_backend_utils"); - numpy_array_ = - triton_pb_utils.attr("deserialize_bytes_tensor")(numpy_array) - .attr("reshape")(dims); + py::object numpy_array = deserialize_bytes_tensor_cpp( + static_cast(memory_ptr_), byte_size_); + numpy_array_ = numpy_array.attr("reshape")(dims_); } } else { numpy_array_ = py::none(); @@ -225,12 +290,43 @@ delete_unused_dltensor(PyObject* dlp) } } + std::shared_ptr PbTensor::FromNumpy(const std::string& name, py::array& numpy_array) { return std::make_shared(name, numpy_array); } +DLDeviceType +PbTensor::DeviceType() +{ + DLDeviceType device_type{}; + + switch (memory_type_) { + case TRITONSERVER_MEMORY_GPU: + device_type = DLDeviceType::kDLCUDA; + break; + case TRITONSERVER_MEMORY_CPU: + device_type = DLDeviceType::kDLCPU; + break; + case TRITONSERVER_MEMORY_CPU_PINNED: + device_type = DLDeviceType::kDLCUDAHost; + break; + } + + return device_type; +} + +py::capsule +PbTensor::DLPack(const py::object& stream) +{ + // Here external tensor requests PbTensor's `__dlpack__` method to provide + // a PyCapsule. By the design of PbTensor, in a GPU case no pending work + // is scheduled to work with PbTensor's data and we can simply pass + // the capsule without a synchronization. + return this->ToDLPack(); +} + py::capsule PbTensor::ToDLPack() { @@ -264,28 +360,24 @@ PbTensor::ToDLPack() py::handle tensor_handle = py::cast(tensor); // Increase the reference count by one to make sure that the DLPack - // represenation doesn't become invalid when the tensor object goes out of + // representation doesn't become invalid when the tensor object goes out of // scope. tensor_handle.inc_ref(); dlpack_tensor->dl_tensor.device.device_id = memory_type_id_; + dlpack_tensor->dl_tensor.device.device_type = this->DeviceType(); dlpack_tensor->dl_tensor.dtype = triton_to_dlpack_type(dtype_); - switch (memory_type_) { - case TRITONSERVER_MEMORY_GPU: - dlpack_tensor->dl_tensor.device.device_type = DLDeviceType::kDLCUDA; - break; - case TRITONSERVER_MEMORY_CPU: - dlpack_tensor->dl_tensor.device.device_type = DLDeviceType::kDLCPU; - break; - case TRITONSERVER_MEMORY_CPU_PINNED: - dlpack_tensor->dl_tensor.device.device_type = DLDeviceType::kDLCUDAHost; - break; - } - return py::capsule( static_cast(dlpack_tensor), "dltensor", &delete_unused_dltensor); } + +std::pair +PbTensor::DLPackDevice() +{ + return std::pair(this->DeviceType(), memory_type_id_); +} + #endif // TRITON_PB_STUB void @@ -305,12 +397,88 @@ PbTensor::Memory() #ifdef TRITON_PB_STUB std::shared_ptr -PbTensor::FromDLPack(const std::string& name, const py::capsule& dlpack_tensor) +PbTensor::FromDLPack(const std::string& name, const py::object& tensor) { if (name == "") { throw PythonBackendException("Tensor name cannot be an empty string."); } + if (py::isinstance(tensor)) { + return FromDLPackCapsule(name, tensor); + } + + if (!py::hasattr(tensor, "__dlpack__") || + !py::hasattr(tensor, "__dlpack_device__")) { + throw PythonBackendException( + "Provided tensor is not supported. 
Tensor must be a DLPack capsule \ + or have `__dlpack__` and `__dlpack_device__` attributes"); + } + + auto capsule_device_info = + tensor.attr("__dlpack_device__")().cast>(); + if (capsule_device_info.first == DLDeviceType::kDLCUDA) { +#ifdef TRITON_ENABLE_GPU + int current_device; + cudaError_t err = cudaGetDevice(¤t_device); + std::unique_ptr& stub = Stub::GetOrCreateInstance(); + if (err != cudaSuccess) { + throw PythonBackendException("Failed to get current CUDA device id."); + } + ScopedSetDevice scoped_set_device(capsule_device_info.second); + + bool overridden = (current_device != capsule_device_info.second); + cudaStream_t proxy_stream = stub->GetProxyStream(current_device); + + // Array API requirements for the stream argument: + // stream = 1 the legacy default stream (in this case should + // synchronize on CUDA stream 0) + // For CPU, `stream=None` is the only accepted argument + // according to array API. For GPU, when `stream=None` producer + // must assume the legacy default stream. Reference: + // https://data-apis.org/array-api/latest/API_specification/generated/array_api.array.__dlpack__.html + auto ptr_to_tensor = FromDLPackCapsule( + name, tensor.attr("__dlpack__")( + py::arg("stream") = + py::int_(reinterpret_cast(proxy_stream)))); + + // In case there is a pending job on the data, where this capsule + // is pointing to, we need to wait for it to finish before returning + // capsule. + // We synchronize on the proxy stream explicitly since that what we + // pass to external tensor's `__dlpack__` method. + err = cudaStreamSynchronize(proxy_stream); + if (err != cudaSuccess) { + throw PythonBackendException( + "Failed to synchronize CUDA device with id " + + std::to_string( + overridden ? capsule_device_info.second : current_device)); + } + + return ptr_to_tensor; +#else + throw PythonBackendException( + "DLPack capsule passed pointer to memory allocated on GPU device, \ + when GPU is not available"); +#endif + } else if ( + capsule_device_info.first != DLDeviceType::kDLCPU && + capsule_device_info.first != DLDeviceType::kDLCUDAHost) { + throw PythonBackendException( + "DLDevice type " + std::to_string(capsule_device_info.first) + + " is not support by Python backend."); + } + // If data is located on a CPU, `stream=None` is the only accepted argument + // according to array API. 
+ // Reference: + // https://data-apis.org/array-api/latest/API_specification/generated/array_api.array.__dlpack__.html + return FromDLPackCapsule( + name, tensor.attr("__dlpack__")(py::arg("stream") = py::none())); +} + +std::shared_ptr +PbTensor::FromDLPackCapsule( + const std::string& name, const py::capsule& dlpack_tensor) +{ DLManagedTensor* dl_managed_tensor = static_cast(dlpack_tensor.get_pointer()); @@ -330,12 +498,14 @@ PbTensor::FromDLPack(const std::string& name, const py::capsule& dlpack_tensor) int64_t calculated_stride{1}; bool is_contiguous_c_order = true; for (size_t i = 1; i < dims.size(); i++) { - if (strides[ndim - i] != calculated_stride) { - is_contiguous_c_order = false; - break; - } + if (dims[ndim - i] != 1) { + if (strides[ndim - i] != calculated_stride) { + is_contiguous_c_order = false; + break; + } - calculated_stride *= dims[ndim - i]; + calculated_stride *= dims[ndim - i]; + } } if (!is_contiguous_c_order) { @@ -390,6 +560,14 @@ PbTensor::~PbTensor() noexcept(false) { pb_memory_.reset(); DeleteDLPack(); + +#ifdef TRITON_PB_STUB + { + py::gil_scoped_acquire acquire; + py::array numpy_array_local(std::move(numpy_array_)); + py::array numpy_array_serialized_local(std::move(numpy_array_serialized_)); + } +#endif } const std::string& @@ -402,12 +580,18 @@ PbTensor::Name() const const py::array* PbTensor::AsNumpy() const { - if (IsCPU()) { - return &numpy_array_; - } else { + if (!IsCPU()) { throw PythonBackendException( "Tensor is stored in GPU and cannot be converted to NumPy."); } + + if (dtype_ == TRITONSERVER_TYPE_BF16) { + throw PythonBackendException( + "Tensor dtype is BF16 and cannot be converted to NumPy. Use " + "to_dlpack() and from_dlpack() instead."); + } + + return &numpy_array_; } #endif // TRITON_PB_STUB @@ -450,7 +634,7 @@ PbTensor::SaveToSharedMemory( if (!pb_memory_) { pb_memory_ = PbMemory::Create( - memory_type_, memory_type_id_, byte_size_, + shm_pool, memory_type_, memory_type_id_, byte_size_, reinterpret_cast(memory_ptr_), reinterpret_cast(tensor_shm_ptr_) + pb_memory_offset, shm_handle_ + pb_memory_offset, copy_gpu); @@ -480,7 +664,7 @@ PbTensor::LoadFromSharedMemory( if (tensor_shm_ptr->memory == 0) { std::size_t pb_memory_offset = name_offset + name_shm->Size(); pb_memory = PbMemory::LoadFromSharedMemory( - pb_memory_offset, tensor_shm.data_.get() + pb_memory_offset, + shm_pool, pb_memory_offset, tensor_shm.data_.get() + pb_memory_offset, open_cuda_handle); } else { pb_memory = PbMemory::LoadFromSharedMemory( @@ -533,19 +717,17 @@ PbTensor::PbTensor( #ifdef TRITON_PB_STUB if (memory_type_ == TRITONSERVER_MEMORY_CPU || memory_type_ == TRITONSERVER_MEMORY_CPU_PINNED) { - if (dtype_ != TRITONSERVER_TYPE_BYTES) { + if (dtype_ == TRITONSERVER_TYPE_BF16) { + // No native numpy representation for BF16. DLPack should be used instead. 
+ numpy_array_ = py::none(); + } else if (dtype_ != TRITONSERVER_TYPE_BYTES) { py::object numpy_array = py::array(triton_to_pybind_dtype(dtype_), dims_, (void*)memory_ptr_); numpy_array_ = numpy_array.attr("view")(triton_to_numpy_type(dtype_)); } else { - py::object numpy_array = py::array( - triton_to_pybind_dtype(TRITONSERVER_TYPE_UINT8), {byte_size_}, - (void*)memory_ptr_); - py::module triton_pb_utils = - py::module::import("triton_python_backend_utils"); - numpy_array_ = - triton_pb_utils.attr("deserialize_bytes_tensor")(numpy_array) - .attr("reshape")(dims_); + py::object numpy_array = deserialize_bytes_tensor_cpp( + static_cast(memory_ptr_), byte_size_); + numpy_array_ = numpy_array.attr("reshape")(dims_); } } else { numpy_array_ = py::none(); diff --git a/src/pb_tensor.h b/src/pb_tensor.h index 912b50a4..4f97b643 100644 --- a/src/pb_tensor.h +++ b/src/pb_tensor.h @@ -1,4 +1,4 @@ -// Copyright 2021-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// Copyright 2021-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions @@ -41,6 +41,7 @@ namespace py = pybind11; #include #include + #include "pb_memory.h" #include "pb_string.h" #include "pb_utils.h" @@ -98,8 +99,7 @@ class PbTensor { int64_t memory_type_id, void* memory_ptr, uint64_t byte_size, DLManagedTensor* dl_managed_tensor = nullptr); - /// This constructor is used when - /// loading the tensor from shared memory. + /// This constructor is used when loading the tensor from shared memory. /// \param tensor_shm The name of the tensor /// \param dims_shm Tensor dimensions /// \param pb_string Triton dtype @@ -112,11 +112,16 @@ class PbTensor { DISALLOW_COPY_AND_ASSIGN(PbTensor); #ifdef TRITON_PB_STUB - /// Construct a Python backend tensor using a DLPack - /// capsule. + /// Construct a Python backend tensor from an + /// external tensor. /// \param dlpack source dlpack tensor /// \param name name of the tensor static std::shared_ptr FromDLPack( + const std::string& name, const py::object& dlpack); + + /// Construct a Python backend tensor using a DLPack + /// capsule. + static std::shared_ptr FromDLPackCapsule( const std::string& name, const py::capsule& dlpack); /// Construct a Python backend tensor using a NumPy object. @@ -125,9 +130,23 @@ class PbTensor { static std::shared_ptr FromNumpy( const std::string& name, py::array& numpy_array); + /// Get device type in DLPack format. + DLDeviceType DeviceType(); + + /// Exports tensor for consumption by `from_dlpack()` as a DLPack capsule. + /// \param stream a Python integer representing a pointer to a stream, + /// on devices that support streams + /// \return Capsule object containing pointer to a DLPack object. + py::capsule DLPack(const py::object& stream); + /// Get a PyCapsule object containing the DLPack representation of the tensor. /// \return Capsule object containing pointer to a DLPack object. py::capsule ToDLPack(); + + /// Returns device type and device ID. + /// Meant for use within `from_dlpack()`. + /// \return a pair (device_type, device_id). 
+ std::pair DLPackDevice(); #endif /// Get the name of the tensor diff --git a/src/pb_utils.cc b/src/pb_utils.cc index 3c607dea..79b45ec2 100644 --- a/src/pb_utils.cc +++ b/src/pb_utils.cc @@ -26,25 +26,23 @@ #include "pb_utils.h" -#include -#include -#include -#include -#include -#include -#include -#include -#include #include -#include + +#include + +#ifdef _WIN32 +#include + +#include +#else +#include #include -#include -#include -#include -#include -#include -#include -#include "scoped_defer.h" +#endif + +#ifndef _WIN32 +extern char** environ; +#endif + #ifdef TRITON_ENABLE_GPU #include @@ -57,45 +55,55 @@ namespace triton { namespace backend { namespace python { CUDAHandler::CUDAHandler() { - dl_open_handle_ = dlopen("libcuda.so", RTLD_LAZY); + dl_open_handle_ = LoadSharedObject("libcuda.so"); - // If libcuda.so is succesfully opened, it must be able to find - // "cuPointerGetAttribute" and "cuGetErrorString" symbols. + // If libcuda.so is successfully opened, it must be able to find + // "cuPointerGetAttribute", "cuGetErrorString", and + // "cuDevicePrimaryCtxGetState" symbols. if (dl_open_handle_ != nullptr) { - void* cu_pointer_get_attribute_fn = - dlsym(dl_open_handle_, "cuPointerGetAttribute"); + void* cu_pointer_get_attribute_fn = LocateSymbol("cuPointerGetAttribute"); if (cu_pointer_get_attribute_fn == nullptr) { throw PythonBackendException( - std::string("Failed to dlsym 'cuPointerGetAttribute'. Error: ") + - dlerror()); + std::string("Failed to locate 'cuPointerGetAttribute'. Error: ") + + LocateSymbolError()); } *((void**)&cu_pointer_get_attribute_fn_) = cu_pointer_get_attribute_fn; - void* cu_get_error_string_fn = dlsym(dl_open_handle_, "cuGetErrorString"); + void* cu_get_error_string_fn = LocateSymbol("cuGetErrorString"); if (cu_get_error_string_fn == nullptr) { throw PythonBackendException( - std::string("Failed to dlsym 'cuGetErrorString'. Error: ") + - dlerror()); + std::string("Failed to locate 'cuGetErrorString'. Error: ") + + LocateSymbolError()); } *((void**)&cu_get_error_string_fn_) = cu_get_error_string_fn; - void* cu_init_fn = dlsym(dl_open_handle_, "cuInit"); + void* cu_init_fn = LocateSymbol("cuInit"); if (cu_init_fn == nullptr) { throw PythonBackendException( - std::string("Failed to dlsym 'cuInit'. Error: ") + dlerror()); + std::string("Failed to locate 'cuInit'. Error: ") + + LocateSymbolError()); } *((void**)&cu_init_fn_) = cu_init_fn; + void* cu_device_primary_ctx_get_state_fn = + LocateSymbol("cuDevicePrimaryCtxGetState"); + if (cu_device_primary_ctx_get_state_fn == nullptr) { + throw PythonBackendException( + std::string( + "Failed to locate 'cuDevicePrimaryCtxGetState'. Error: ") + + LocateSymbolError()); + } + *((void**)&cu_device_primary_ctx_get_state_fn_) = + cu_device_primary_ctx_get_state_fn; + // Initialize the driver API. 
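The `*((void**)&cu_..._fn_) = ...` assignments in the constructor above exist because ISO C++ has no direct conversion from the `void*` returned by dlsym/GetProcAddress to a function pointer. A generic helper that does the same thing through memcpy is sketched below; `LoadSym` is a hypothetical name and the sketch assumes POSIX dlsym only.

// Illustrative sketch (not in this PR): copy a symbol address from an
// already-opened shared library into a typed function pointer without a
// direct void* -> function-pointer cast.
#include <cstring>
#include <stdexcept>
#include <string>

#include <dlfcn.h>

template <typename FnPtr>
void
LoadSym(void* handle, const char* name, FnPtr* out)
{
  void* sym = dlsym(handle, name);
  if (sym == nullptr) {
    throw std::runtime_error(
        std::string("Failed to locate '") + name + "': " + dlerror());
  }
  static_assert(sizeof(FnPtr) == sizeof(void*), "unexpected pointer size");
  // memcpy is the well-defined way to reinterpret the object pointer as a
  // function pointer on platforms where both have the same representation.
  std::memcpy(out, &sym, sizeof(sym));
}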
CUresult cuda_err = (*cu_init_fn_)(0 /* flags */); if (cuda_err != CUDA_SUCCESS) { const char* error_string; (*cu_get_error_string_fn_)(cuda_err, &error_string); - throw PythonBackendException( - std::string( - "failed to get cuda pointer device attribute: " + - std::string(error_string)) - .c_str()); + error_str_ = std::string("failed to call cuInit: ") + error_string; + CloseLibrary(); + dl_open_handle_ = nullptr; } } } @@ -130,41 +138,9 @@ CUDAHandler::OpenCudaHandle( void** data_ptr) { std::lock_guard guard{mu_}; - int current_device; + ScopedSetDevice scoped_set_device(memory_type_id); - // Save the previous device - cudaError_t err = cudaGetDevice(&current_device); - if (err != cudaSuccess) { - throw PythonBackendException( - std::string("Failed to get the current CUDA device. error: ") + - cudaGetErrorString(err)); - } - - bool overridden = (current_device != memory_type_id); - - // Restore the previous device before returning from the function. - ScopedDefer _(std::bind([&overridden, &current_device] { - if (overridden) { - cudaError_t err = cudaSetDevice(current_device); - if (err != cudaSuccess) { - throw PythonBackendException( - "Failed to set the CUDA device to " + - std::to_string(current_device) + - ". error: " + cudaGetErrorString(err)); - } - } - })); - - if (overridden) { - err = cudaSetDevice(memory_type_id); - if (err != cudaSuccess) { - throw PythonBackendException( - "Failed to set the CUDA device to " + std::to_string(memory_type_id) + - ". error: " + cudaGetErrorString(err)); - } - } - - err = cudaIpcOpenMemHandle( + cudaError_t err = cudaIpcOpenMemHandle( data_ptr, *cuda_mem_handle, cudaIpcMemLazyEnablePeerAccess); if (err != cudaSuccess) { throw PythonBackendException( @@ -187,31 +163,8 @@ CUDAHandler::CloseCudaHandle(int64_t memory_type_id, void* data_ptr) cudaGetErrorString(err)); } - bool overridden = (current_device != memory_type_id); - // Restore the previous device before returning from the function. - ScopedDefer _(std::bind([&overridden, &current_device] { - if (overridden) { - cudaError_t err = cudaSetDevice(current_device); - if (err != cudaSuccess) { - throw PythonBackendException( - "Failed to set the CUDA device to " + - std::to_string(current_device) + - ". error: " + cudaGetErrorString(err)); - } - } - })); - - if (overridden) { - err = cudaSetDevice(memory_type_id); - if (err != cudaSuccess) { - throw PythonBackendException( - std::string("Failed to set the CUDA device to ") + - std::to_string(memory_type_id) + - ". error: " + cudaGetErrorString(err)); - } - } - + ScopedSetDevice scoped_set_device(memory_type_id); err = cudaIpcCloseMemHandle(data_ptr); if (err != cudaSuccess) { throw PythonBackendException( @@ -220,16 +173,137 @@ } } +bool +CUDAHandler::HasPrimaryContext(int device) +{ + unsigned int ctx_flags; + int ctx_is_active = 0; + CUresult cuda_err = (*cu_device_primary_ctx_get_state_fn_)( + device, &ctx_flags, &ctx_is_active); + if (cuda_err != CUDA_SUCCESS) { + const char* error_string; + (*cu_get_error_string_fn_)(cuda_err, &error_string); + throw PythonBackendException( + std::string( + "failed to get primary context state: " + std::string(error_string)) + .c_str()); + } + + return ctx_is_active == 1; + } + +void +CUDAHandler::MaybeSetDevice(int device) +{ + if (HasPrimaryContext(device)) { + cudaError_t err = cudaSetDevice(device); + if (err != cudaSuccess) { + throw PythonBackendException( + std::string("Failed to set the CUDA device to ") + + std::to_string(device) + ". 
error: " + cudaGetErrorString(err)); + } + } +} + + CUDAHandler::~CUDAHandler() noexcept(false) { if (dl_open_handle_ != nullptr) { - int status = dlclose(dl_open_handle_); - if (status != 0) { - throw PythonBackendException("Failed to close the libcuda handle."); - } + CloseLibrary(); } } + +void* +CUDAHandler::LoadSharedObject(const char* filename) +{ +#ifdef _WIN32 + // NOTE: 'nvcuda.dll' is a placeholder library. Apparently, this should be the + // equivalent library for Windows, but need to verify. + return LoadLibraryA("nvcuda.dll"); +#else + return dlopen("libcuda.so", RTLD_LAZY); +#endif +} + +void* +CUDAHandler::LocateSymbol(const char* symbol) +{ +#ifdef _WIN32 + return GetProcAddress(static_cast(dl_open_handle_), symbol); +#else + return dlsym(dl_open_handle_, symbol); +#endif +} + + +std::string +CUDAHandler::LocateSymbolError() +{ +#ifdef _WIN32 + return std::to_string(GetLastError()); +#else + return dlerror(); #endif +} + +void +CUDAHandler::CloseLibrary() +{ + bool successful = true; +#ifdef _WIN32 + successful = (FreeLibrary(static_cast(dl_open_handle_)) != 0); +#else + successful = (dlclose(dl_open_handle_) == 0); +#endif + if (!successful) { + throw PythonBackendException("Failed to close the cuda library handle."); + } +} + + +ScopedSetDevice::ScopedSetDevice(int device) +{ + device_ = device; + THROW_IF_CUDA_ERROR(cudaGetDevice(¤t_device_)); + + if (current_device_ != device_) { + THROW_IF_CUDA_ERROR(cudaSetDevice(device_)); + } +} + +ScopedSetDevice::~ScopedSetDevice() +{ + if (current_device_ != device_) { + CUDAHandler& cuda_handler = CUDAHandler::getInstance(); + cuda_handler.MaybeSetDevice(current_device_); + } +} + +bool +IsUsingCUDAPool( + std::unique_ptr& cuda_pool, int64_t memory_type_id, + void* data) +{ + CUDAHandler& cuda_api = CUDAHandler::getInstance(); + CUdeviceptr cuda_pool_address = 0; + cuda_api.PointerGetAttribute( + &cuda_pool_address, CU_POINTER_ATTRIBUTE_RANGE_START_ADDR, + reinterpret_cast(data)); + + return ( + cuda_pool->CUDAPoolAddress(memory_type_id) == + reinterpret_cast(cuda_pool_address)); +} + +#endif // TRITON_ENABLE_GPU + +// FIXME: [DLIS-6078]: We should not need this function. However, some paths are +// being retrieved from core that are not platform-agnostic. 
+void +SanitizePath(std::string& path) +{ + std::replace(path.begin(), path.end(), '/', '\\'); +} #ifndef TRITON_PB_STUB std::shared_ptr @@ -248,5 +322,119 @@ WrapTritonErrorInSharedPtr(TRITONSERVER_Error* error) *response_error = error; return response_error; } +#endif // NOT TRITON_PB_STUB + +bool +IsValidIdentifier(const std::string& input) +{ + // Check for invalid characters + if (input.empty() || + input.find_first_of(INVALID_CHARS) != std::string::npos) { + return false; + } + + return true; +} + +bool +IsExecutableFile(const std::string& filepath) +{ + struct stat file_stat; + if (stat(filepath.c_str(), &file_stat) != 0) { + return false; + } + + // Check if it's a regular file and executable by owner + return S_ISREG(file_stat.st_mode) && (file_stat.st_mode & S_IXUSR); +} + +std::string +GenerateUUID() +{ + static boost::uuids::random_generator generator; + boost::uuids::uuid uuid = generator(); + return boost::uuids::to_string(uuid); +} + +// Helper function to get environment variables for Python virtual environments +std::map +ParseActivationScript(const std::string& activate_path) +{ + std::map env_vars; + + // Read the current environment as baseline +#ifndef _WIN32 + if (environ != nullptr) { + for (char** env = environ; *env != nullptr; env++) { + std::string env_str(*env); + size_t eq_pos = env_str.find('='); + if (eq_pos != std::string::npos) { + std::string key = env_str.substr(0, eq_pos); + std::string value = env_str.substr(eq_pos + 1); + env_vars[key] = value; + } + } + } #endif + + // Extract virtual environment root from activation script path + std::string venv_path = activate_path; + size_t bin_activate_pos = venv_path.find("/bin/activate"); + if (bin_activate_pos != std::string::npos) { + venv_path = venv_path.substr(0, bin_activate_pos); + } + + // Set standard virtual environment variables + env_vars["VIRTUAL_ENV"] = venv_path; + env_vars["VIRTUAL_ENV_PROMPT"] = "(" + venv_path + ")"; + + // Update PATH to include the virtual environment's bin directory + std::string new_path = venv_path + "/bin"; + if (env_vars.find("PATH") != env_vars.end()) { + new_path += ":" + env_vars["PATH"]; + } + env_vars["PATH"] = new_path; + + // Update LD_LIBRARY_PATH to include the virtual environment's lib directory + std::string new_lib_path = venv_path + "/lib"; + if (env_vars.find("LD_LIBRARY_PATH") != env_vars.end()) { + new_lib_path += ":" + env_vars["LD_LIBRARY_PATH"]; + } + env_vars["LD_LIBRARY_PATH"] = new_lib_path; + + // Remove PYTHONHOME if it exists + env_vars.erase("PYTHONHOME"); + + return env_vars; +} + +// Helper function to prepare environment array for execve +std::pair, std::vector> +PrepareEnvironment( + const std::map& env_vars, + const std::string& additional_lib_path) +{ + std::vector env_strings; + std::vector env_array; + + for (const auto& [key, value] : env_vars) { + std::string env_string; + if (key == "LD_LIBRARY_PATH" && !additional_lib_path.empty()) { + // Prepend the additional library path + env_string = key + "=" + additional_lib_path + ":" + value; + } else { + env_string = key + "=" + value; + } + env_strings.push_back(env_string); + } + + // Convert to char* array + for (auto& env_str : env_strings) { + env_array.push_back(const_cast(env_str.c_str())); + } + env_array.push_back(nullptr); + + return std::make_pair(std::move(env_strings), std::move(env_array)); +} + }}} // namespace triton::backend::python diff --git a/src/pb_utils.h b/src/pb_utils.h index 20f17795..fa315210 100644 --- a/src/pb_utils.h +++ b/src/pb_utils.h @@ -1,4 +1,4 @@ -// 
Copyright 2021-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// Copyright 2021-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions @@ -29,15 +29,21 @@ #ifdef TRITON_ENABLE_GPU #include #endif // TRITON_ENABLE_GPU -#include + #include #include +#include +#include +#include #include +#include #include #include #include #include +#include #include + #include "pb_exception.h" #include "shm_manager.h" #include "triton/backend/backend_common.h" @@ -69,14 +75,16 @@ namespace bi = boost::interprocess; TRITONSERVER_ErrorMessage(pb2_exception.what())); \ } \ } \ - while (false) - -#define THROW_IF_TRITON_ERROR(X) \ - do { \ - TRITONSERVER_Error* tie_err__ = (X); \ - if (tie_err__ != nullptr) { \ - throw PythonBackendException(TRITONSERVER_ErrorMessage(tie_err__)); \ - } \ + } while (false) + +#define THROW_IF_TRITON_ERROR(X) \ + do { \ + TRITONSERVER_Error* tie_err__ = (X); \ + if (tie_err__ != nullptr) { \ + auto error_message = std::string(TRITONSERVER_ErrorMessage(tie_err__)); \ + TRITONSERVER_ErrorDelete(tie_err__); \ + throw PythonBackendException(error_message); \ + } \ } while (false) #define THROW_IF_CUDA_ERROR(X) \ @@ -161,9 +169,14 @@ struct ResponseBatch : SendMessageBase { bool is_error_set; uint32_t response_size; + + // Indicates whether the response factory has been deleted or not. + bool is_response_factory_deleted = false; }; -enum LogLevel { INFO = 0, WARNING, ERROR, VERBOSE }; +enum LogLevel { kInfo = 0, kWarning, kError, kVerbose }; + +enum MetricKind { kCounter = 0, kGauge, kHistogram }; struct LogSendMessage : SendMessageBase { bi::managed_external_buffer::handle_t filename; @@ -172,11 +185,47 @@ struct LogSendMessage : SendMessageBase { LogLevel level; }; - struct CleanupMessage : SendMessageBase { void* id; }; +struct CancelBLSRequestMessage : SendMessageBase { + void* infer_payload_id; + bool is_cancelled; +}; + +struct IsCancelledMessage : SendMessageBase { + intptr_t response_factory_address; + intptr_t request_address; + bool is_cancelled; +}; + +struct CustomMetricsMessage : SendMessageBase { + bi::managed_external_buffer::handle_t message; + bool has_error; + bool is_error_set; + bi::managed_external_buffer::handle_t error; + // This field is specifically utilized when making the + // 'PYTHONSTUB_MetricRequestValue' request. It is used to hold the metric + // value after the Python backend calls the Triton C API to retrieve the + // metric value and pass it back to the stub process. + double value; + // This field is specifically utilized when making the + // 'PYTHONSTUB_MetricFamilyRequestNew' or 'PYTHONSTUB_MetricRequestNew' + // requests. It is used to hold the memory address of + // TRITONSERVER_MetricFamily' or 'TRITONSERVER_Metric' objects created in the + // Python backend and pass back to the stub process. + void* address; +}; + +struct ModelLoaderMessage : SendMessageBase { + bi::managed_external_buffer::handle_t message; + bool has_error; + bool is_error_set; + bi::managed_external_buffer::handle_t error; + bool is_model_ready; +}; + struct ResponseSenderBase { bi::interprocess_mutex mu; bi::interprocess_condition cv; @@ -191,26 +240,35 @@ struct ResponseSenderBase { struct ResponseSendMessage : ResponseSenderBase { bi::managed_external_buffer::handle_t response; - // GPU Buffers handle + // A shm handle to a GPUBuffersShm object. 
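The revised THROW_IF_TRITON_ERROR macro above now copies the error message and calls TRITONSERVER_ErrorDelete before throwing, so the TRITONSERVER_Error object is no longer leaked when the exception propagates. The same logic as a plain function, purely for illustration (`ThrowIfTritonError` is a hypothetical name, not part of the diff):

// Illustrative sketch (not in this PR): function form of the updated
// THROW_IF_TRITON_ERROR macro. The message is copied into a std::string and
// the TRITONSERVER_Error is released before the exception is thrown.
#include <string>

#include "pb_exception.h"
#include "triton/core/tritonserver.h"

inline void
ThrowIfTritonError(TRITONSERVER_Error* err)
{
  if (err != nullptr) {
    std::string error_message = TRITONSERVER_ErrorMessage(err);
    TRITONSERVER_ErrorDelete(err);
    throw triton::backend::python::PythonBackendException(error_message);
  }
}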
bi::managed_external_buffer::handle_t gpu_buffers_handle; - // GPU buffers count - uint32_t gpu_buffers_count; - uint32_t flags; }; struct RequestBatch { uint32_t batch_size; - // GPU Buffers handle + // A shm handle to a GPUBuffersShm object. bi::managed_external_buffer::handle_t gpu_buffers_handle; +}; - // GPU buffers count - uint32_t gpu_buffers_count; +struct MemoryReleaseMessage { + std::mutex mu; + std::condition_variable cv; + uint64_t id; + bool waiting_on_stub; }; #ifdef TRITON_ENABLE_GPU +struct CUDAMemPoolMessage : SendMessageBase { + cudaIpcMemHandle_t cuda_handle; + int32_t device_id; + bi::managed_external_buffer::handle_t error; + bool has_error; + bool is_error_set; +}; + class CUDAHandler { public: static CUDAHandler& getInstance() @@ -222,17 +280,25 @@ class CUDAHandler { private: std::mutex mu_; void* dl_open_handle_ = nullptr; + std::string error_str_; CUresult (*cu_pointer_get_attribute_fn_)( CUdeviceptr*, CUpointer_attribute, CUdeviceptr) = nullptr; CUresult (*cu_get_error_string_fn_)(CUresult, const char**) = nullptr; CUresult (*cu_init_fn_)(unsigned int) = nullptr; + CUresult (*cu_device_primary_ctx_get_state_fn_)( + CUdevice, unsigned int*, int*) = nullptr; CUDAHandler(); + + /// Check if a primary context has already been created for a device. + bool HasPrimaryContext(int device); ~CUDAHandler() noexcept(false); public: CUDAHandler(CUDAHandler const&) = delete; void operator=(CUDAHandler const&) = delete; bool IsAvailable(); + const std::string& GetErrorString() const { return error_str_; } + void ClearErrorString() { return error_str_.clear(); } void PointerGetAttribute( CUdeviceptr* start_address, CUpointer_attribute attr, CUdeviceptr device_ptr); @@ -240,12 +306,65 @@ class CUDAHandler { int64_t memory_type_id, cudaIpcMemHandle_t* cuda_mem_handle, void** data_ptr); void CloseCudaHandle(int64_t memory_type_id, void* data_ptr); + void* LoadSharedObject(const char* filename); + void* LocateSymbol(const char* symbol); + std::string LocateSymbolError(); + void CloseLibrary(); + + /// Set the device only if the primary context has already been created for + /// this device. Inspired from PyTorch's MaybeSetDevice. + /// \param device The cuda device index. + void MaybeSetDevice(int device); }; + + +/// A helper class to change the current device and restore the old context. The +/// old context will be restored only if the primary context for that device is +/// already created, otherwise the CUDA context will remain as the primary +/// context of 'device'. +class ScopedSetDevice { + public: + ScopedSetDevice(int device); + ~ScopedSetDevice(); + + private: + int device_; + int current_device_; +}; + +// Check if the data is allocated from the pool by the base address. +bool IsUsingCUDAPool( + std::unique_ptr& cuda_pool, int64_t memory_type_id, + void* data); + #endif // TRITON_ENABLE_GPU +// FIXME: [DLIS-6078]: We should not need this function. However, some paths are +// being retrieved from core that are not platform-agnostic. +void SanitizePath(std::string& path); + +// Invalid characters that are not allowed in user input +constexpr const char* INVALID_CHARS = ";|&$`<>()[]{}\\\"'*?~#!"; + +// Validate that an identifier (model name, region name, etc.) 
+bool IsValidIdentifier(const std::string& input); + +// Check if a file exists and is executable +bool IsExecutableFile(const std::string& filepath); + #ifndef TRITON_PB_STUB std::shared_ptr WrapTritonErrorInSharedPtr( TRITONSERVER_Error* error); #endif +std::string GenerateUUID(); + +// Environment handling utilities for Python activation scripts +std::map ParseActivationScript( + const std::string& activate_path); + +std::pair, std::vector> PrepareEnvironment( + const std::map& env_vars, + const std::string& additional_lib_path = ""); + }}} // namespace triton::backend::python diff --git a/src/python_be.cc b/src/python_be.cc index fa33dbae..c152e035 100644 --- a/src/python_be.cc +++ b/src/python_be.cc @@ -1,4 +1,4 @@ -// Copyright 2020-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// Copyright 2020-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions @@ -25,7 +25,12 @@ // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "python_be.h" +#include + +#include "correlation_id.h" +#include "gpu_buffers.h" #include "infer_payload.h" +#include "model_loader.h" #include "pb_log.h" namespace triton { namespace backend { namespace python { @@ -148,107 +153,6 @@ ModelInstanceState::SetErrorForResponseSendMessage( } } -void -ModelInstanceState::SendMessageAndReceiveResponse( - bi::managed_external_buffer::handle_t message, - bi::managed_external_buffer::handle_t& response, bool& restart, - std::shared_ptr>& responses, - TRITONBACKEND_Request** requests, const uint32_t request_count) -{ - auto error = SendMessageToStub(message); - if (error != nullptr) { - restart = true; - RespondErrorToAllRequests( - TRITONSERVER_ErrorMessage(error), responses, requests, request_count); - - return; - } - - bi::managed_external_buffer::handle_t response_message; - error = Stub()->ReceiveMessageFromStub(response_message); - if (error != nullptr) { - restart = true; - RespondErrorToAllRequests( - TRITONSERVER_ErrorMessage(error), responses, requests, request_count); - - return; - } - - response = response_message; -} - -TRITONSERVER_Error* -ModelInstanceState::SendMessageToStub( - bi::managed_external_buffer::handle_t message) -{ - bool success = false; - while (!success) { - uint64_t timeout_miliseconds = 1000; - { - boost::posix_time::ptime timeout = - boost::get_system_time() + - boost::posix_time::milliseconds(timeout_miliseconds); - - bi::scoped_lock lock( - *(Stub()->HealthMutex()), timeout); - - // Check if lock has been acquired. - if (lock) { - Stub()->IpcControl()->stub_health = false; - } else { - // If it failed to obtain the lock, it means that the stub has been - // stuck or exited while holding the health mutex lock. 
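The comment above describes why the stub health mutex is taken with a timeout: if the lock cannot be acquired within the deadline, the stub is assumed to be stuck or dead while holding it. A self-contained sketch of that timed-lock pattern follows; `TryMarkUnhealthy` is a hypothetical helper standing in for the real SendMessageToStub logic.

// Illustrative sketch (not in this PR): timed acquisition of an interprocess
// mutex; failing to acquire it within the deadline is treated as the peer
// process being stuck or dead.
#include <boost/date_time/posix_time/posix_time.hpp>
#include <boost/interprocess/sync/interprocess_mutex.hpp>
#include <boost/interprocess/sync/scoped_lock.hpp>

namespace bi = boost::interprocess;

bool
TryMarkUnhealthy(bi::interprocess_mutex& health_mutex, bool& health_flag)
{
  boost::posix_time::ptime deadline =
      boost::posix_time::microsec_clock::universal_time() +
      boost::posix_time::milliseconds(1000);
  bi::scoped_lock<bi::interprocess_mutex> lock(health_mutex, deadline);
  if (!lock) {
    // The other process likely exited or is stuck while holding the mutex.
    return false;
  }
  health_flag = false;  // The healthy peer is expected to set this back.
  return true;
}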
- return TRITONSERVER_ErrorNew( - TRITONSERVER_ERROR_INTERNAL, "Failed to obtain the health mutex."); - } - } - - Stub()->StubMessageQueue()->Push( - message, timeout_miliseconds /* duration ms */, success); - - if (!success && !IsStubProcessAlive()) { - return TRITONSERVER_ErrorNew( - TRITONSERVER_ERROR_INTERNAL, "Stub process is not healthy."); - } - } - - return nullptr; // success -} - -void -ModelInstanceState::RespondErrorToAllRequests( - const char* message, - std::shared_ptr>& responses, - TRITONBACKEND_Request** requests, const uint32_t request_count) -{ - for (uint32_t r = 0; r < request_count; ++r) { - if ((*responses)[r] == nullptr) - continue; - - std::string err_message = - std::string( - "Failed to process the request(s) for model instance '" + Name() + - "', message: ") + - message; - - TRITONSERVER_Error* err = - TRITONSERVER_ErrorNew(TRITONSERVER_ERROR_INTERNAL, err_message.c_str()); - LOG_IF_ERROR( - TRITONBACKEND_ResponseSend( - (*responses)[r], TRITONSERVER_RESPONSE_COMPLETE_FINAL, err), - "failed sending response"); - - (*responses)[r] = nullptr; - TRITONSERVER_ErrorDelete(err); - } -} - -void -ModelInstanceState::WaitForBLSRequestsToFinish() -{ - futures_.clear(); -} - bool ModelInstanceState::IsStubProcessAlive() { @@ -269,12 +173,12 @@ ModelInstanceState::IsStubProcessAlive() TRITONSERVER_Error* ModelInstanceState::SaveRequestsToSharedMemory( TRITONBACKEND_Request** requests, const uint32_t request_count, - std::vector>& pb_inference_requests, + std::vector>& pb_infer_requests, AllocatedSharedMemory& request_batch, std::shared_ptr>& responses) { // Clear any existing items in the requests vector - pb_inference_requests.clear(); + pb_infer_requests.clear(); ModelState* model_state = reinterpret_cast(Model()); RETURN_IF_EXCEPTION( @@ -338,6 +242,9 @@ ModelInstanceState::SaveRequestsToSharedMemory( } else if (type == TRITONSERVER_PARAMETER_STRING) { std::string string = reinterpret_cast(vvalue); RETURN_IF_ERROR(parameters_json.AddString(name, string)); + } else if (type == TRITONSERVER_PARAMETER_DOUBLE) { + RETURN_IF_ERROR(parameters_json.AddDouble( + name, *(reinterpret_cast(vvalue)))); } else { return TRITONSERVER_ErrorNew( TRITONSERVER_ERROR_INVALID_ARG, @@ -355,33 +262,63 @@ ModelInstanceState::SaveRequestsToSharedMemory( const char* id; RETURN_IF_ERROR(TRITONBACKEND_RequestId(request, &id)); - uint64_t correlation_id; - RETURN_IF_ERROR( - TRITONBACKEND_RequestCorrelationId(request, &correlation_id)); + uint64_t correlation_id_uint = 0; + CorrelationId correlation_id; + + auto error = + TRITONBACKEND_RequestCorrelationId(request, &correlation_id_uint); + if (error != nullptr) { + TRITONSERVER_ErrorDelete(error); + const char* correlation_id_string = ""; + RETURN_IF_ERROR(TRITONBACKEND_RequestCorrelationIdString( + request, &correlation_id_string)); + correlation_id = CorrelationId(std::string(correlation_id_string)); + } else { + correlation_id = CorrelationId(correlation_id_uint); + } uint32_t flags; RETURN_IF_ERROR(TRITONBACKEND_RequestFlags(request, &flags)); - std::unique_ptr infer_request; - if (model_state->IsDecoupled()) { - TRITONBACKEND_ResponseFactory* factory_ptr; - RETURN_IF_ERROR(TRITONBACKEND_ResponseFactoryNew(&factory_ptr, request)); - infer_request = std::make_unique( - id, correlation_id, pb_input_tensors, requested_output_names, - model_state->Name(), model_state->Version(), parameters_string, flags, - 0 /* BLS request timeout*/, reinterpret_cast(factory_ptr), - reinterpret_cast(request)); - } else { - infer_request = std::make_unique( - id, 
correlation_id, pb_input_tensors, requested_output_names, - model_state->Name(), model_state->Version(), parameters_string, flags, - 0 /* BLS request timeout*/, 0 /* response_factory_address */, - reinterpret_cast(request)); + // Do not return if error in this case, because Triton core + // will return an error if tracing is disabled (see PYBE PR#295). + // For the same reason, we do not log the error message, otherwise + // when Triton is compiled without tracing, it'll constantly log + // this error. + TRITONSERVER_InferenceTrace* triton_trace; + auto err = TRITONBACKEND_RequestTrace(request, &triton_trace); + if (err != nullptr) { + triton_trace = nullptr; + TRITONSERVER_ErrorDelete(err); + } + const char* val = nullptr; + if (triton_trace != nullptr) { + LOG_IF_ERROR( + TRITONSERVER_InferenceTraceContext(triton_trace, &val), + "failed to retrieve trace context"); } + std::string context = (val != nullptr) ? std::string(val) : ""; + InferenceTrace trace = + InferenceTrace(reinterpret_cast(triton_trace), context); + + uint64_t request_timeout; + RETURN_IF_ERROR(TRITONBACKEND_InferenceRequestTimeoutMicroseconds( + request, &request_timeout)); + + std::unique_ptr infer_request; + TRITONBACKEND_ResponseFactory* factory_ptr = nullptr; + RETURN_IF_ERROR(TRITONBACKEND_ResponseFactoryNew(&factory_ptr, request)); + + infer_request = std::make_unique( + id, correlation_id, pb_input_tensors, requested_output_names, + model_state->Name(), model_state->Version(), parameters_string, flags, + request_timeout, reinterpret_cast(factory_ptr), + reinterpret_cast(request), + PreferredMemory(PreferredMemory::kDefault, 0), trace); RETURN_IF_EXCEPTION(infer_request->SaveToSharedMemory(Stub()->ShmPool())); requests_shm[r] = infer_request->ShmHandle(); - pb_inference_requests.emplace_back(std::move(infer_request)); + pb_infer_requests.emplace_back(std::move(infer_request)); } return nullptr; // success @@ -402,11 +339,6 @@ ModelInstanceState::LaunchStubProcess() thread_pool_ = std::make_unique( model_state->StateForBackend()->thread_pool_size); - if (model_state->IsDecoupled()) { - decoupled_thread_ = true; - decoupled_monitor_ = - std::thread(&ModelInstanceState::DecoupledMessageQueueMonitor, this); - } request_executor_ = std::make_unique( Stub()->ShmPool(), model_state->TritonServer()); @@ -460,7 +392,15 @@ ModelInstanceState::GetInputTensor( CUDAHandler& cuda_handler = CUDAHandler::getInstance(); // If CUDA driver API is not available, the input tensors will be moved to // CPU. - if (!cuda_handler.IsAvailable()) { + if (!cuda_handler.IsAvailable() && !cpu_only_tensors) { + if (!cuda_handler.GetErrorString().empty()) { + LOG_MESSAGE( + TRITONSERVER_LOG_WARN, (std::string( + "Forcing CPU only input tensors: " + + cuda_handler.GetErrorString())) + .c_str()); + } + cuda_handler.ClearErrorString(); cpu_only_tensors = true; } #endif @@ -499,16 +439,31 @@ ModelInstanceState::GetInputTensor( RETURN_IF_ERROR(backend::ReadInputTensor( request, input_name, input_buffer, &byte_size)); } + + if (input_dtype == TRITONSERVER_TYPE_BYTES) { + const char* content = reinterpret_cast(input_tensor->DataPtr()); + size_t content_byte_size = input_tensor->ByteSize(); + int64_t request_element_cnt = 0; + RETURN_IF_ERROR( + GetElementCount(input_tensor->Dims(), &request_element_cnt)); + RETURN_IF_ERROR(ValidateStringBuffer( + content, content_byte_size, request_element_cnt, input_name, + nullptr /* str_list */)); + } } else { #ifdef TRITON_ENABLE_GPU + // Attempt to use the cuda shared memory pool for GPU tensor. 
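Correlation IDs are now routed through the new CorrelationId type from correlation_id.h, which is included at the top of python_be.cc but not shown in this excerpt; the fallback above first asks for the integer form and, if Triton reports an error, retries with the string form. Below is a minimal, purely hypothetical sketch of what such a dual-representation type could look like; it is an assumption about the real header, not a copy of it.

// Hypothetical sketch only: the real CorrelationId lives in correlation_id.h.
// This variant-based stand-in just captures the idea of holding either a
// uint64_t or a string correlation id.
#include <cstdint>
#include <string>
#include <variant>

class CorrelationIdSketch {
 public:
  CorrelationIdSketch() : id_(uint64_t{0}) {}
  explicit CorrelationIdSketch(uint64_t id) : id_(id) {}
  explicit CorrelationIdSketch(const std::string& id) : id_(id) {}

  bool IsString() const { return std::holds_alternative<std::string>(id_); }
  uint64_t UnsignedIntValue() const { return std::get<uint64_t>(id_); }
  const std::string& StringValue() const { return std::get<std::string>(id_); }

 private:
  std::variant<uint64_t, std::string> id_;
};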
+ ShareCUDAMemoryPool(src_memory_type_id); - // Retreiving GPU input tensors + // Retrieving GPU input tensors const void* buffer = nullptr; std::vector> alloc_perference; alloc_perference = {{TRITONSERVER_MEMORY_GPU, src_memory_type_id}}; // collector is used in the non-decoupled mode. if (collector) { + // The ProcessTensor function will try to allocate the buffer in the CUDA + // pool first. RETURN_IF_ERROR(collector->ProcessTensor( input_name, nullptr, 0, alloc_perference, reinterpret_cast(&buffer), &input_byte_size, @@ -548,10 +503,22 @@ ModelInstanceState::GetInputTensor( Stub()->ShmPool(), true /* copy_gpu */)); } } else { + // Try to use the cuda shared memory pool first. void* dev_ptr; - RETURN_IF_CUDA_ERROR( - cudaMalloc(&dev_ptr, input_byte_size), TRITONSERVER_ERROR_INTERNAL, - std::string("Failed to allocated CUDA memory")); + BackendMemory* backend_memory; + std::unique_ptr lbackend_memory; + RETURN_IF_ERROR(BackendMemory::Create( + reinterpret_cast( + Stub() + ->ShmPool() + ->GetCUDAMemoryPoolManager() + ->TritonMemoryManager()), + {BackendMemory::AllocationType::GPU_POOL, + BackendMemory::AllocationType::GPU}, + src_memory_type_id, input_byte_size, &backend_memory)); + + dev_ptr = backend_memory->MemoryPtr(); + lbackend_memory.reset(backend_memory); size_t byte_size = input_byte_size; @@ -574,14 +541,11 @@ ModelInstanceState::GetInputTensor( const_cast(dev_ptr), input_byte_size, nullptr /* DLManagedTensor */); + input_tensor->SetMemory(std::move( + PbMemory::Create(Stub()->ShmPool(), std::move(lbackend_memory)))); + RETURN_IF_EXCEPTION(input_tensor->SaveToSharedMemory( Stub()->ShmPool(), true /* copy_gpu */)); - - std::unique_ptr gpu_memory_record = - std::make_unique(input_tensor->Memory()->DataPtr()); - uint64_t memory_release_id = - Stub()->GetMemoryManager()->AddRecord(std::move(gpu_memory_record)); - input_tensor->Memory()->SetMemoryReleaseId(memory_release_id); } #else return TRITONSERVER_ErrorNew( @@ -623,17 +587,17 @@ ModelInstanceState::ExecuteBLSRequest( is_response_batch_set = true; bool has_gpu_tensor = false; + GPUBuffersHelper gpu_buffer_helper; PythonBackendException pb_exception(std::string{}); - - uint32_t gpu_buffers_count = 0; if (request_batch_shm_ptr->batch_size == 1) { std::shared_ptr infer_request; bi::managed_external_buffer::handle_t* request_handle = reinterpret_cast( request_batch.data_.get() + sizeof(RequestBatch)); infer_request = InferRequest::LoadFromSharedMemory( - Stub()->ShmPool(), *request_handle, false /* open_cuda_handle */); + Stub()->ShmPool(), *request_handle, false /* open_cuda_handle */, + nullptr /* is_model_decoupled */); // If the BLS inputs are in GPU an additional round trip between the // stub process and the main process is required. The reason is that we @@ -643,7 +607,8 @@ ModelInstanceState::ExecuteBLSRequest( for (auto& input_tensor : infer_request->Inputs()) { if (!input_tensor->IsCPU()) { #ifdef TRITON_ENABLE_GPU - gpu_buffers_count++; + // Attempt to use the cuda shared memory pool for GPU tensor. 
+ ShareCUDAMemoryPool(input_tensor->MemoryTypeId()); BackendMemory* backend_memory; std::unique_ptr lbackend_memory; has_gpu_tensor = true; @@ -661,38 +626,24 @@ ModelInstanceState::ExecuteBLSRequest( lbackend_memory.reset(backend_memory); input_tensor->SetMemory(std::move(PbMemory::Create( Stub()->ShmPool(), std::move(lbackend_memory)))); + gpu_buffer_helper.AddBuffer(input_tensor->Memory()->ShmHandle()); #endif // TRITON_ENABLE_GPU } } } catch (const PythonBackendException& exception) { + gpu_buffer_helper.SetError(Stub()->ShmPool(), exception.what()); pb_exception = exception; } - AllocatedSharedMemory gpu_handles; // Wait for the extra round trip to complete. The stub process will fill // in the data for the GPU tensors. If there is an error, the extra round // trip must be still completed, otherwise the stub process will always be // waiting for a message from the parent process. if (has_gpu_tensor) { - try { - gpu_handles = Stub() - ->ShmPool() - ->Construct( - gpu_buffers_count); - request_batch_shm_ptr->gpu_buffers_count = gpu_buffers_count; - request_batch_shm_ptr->gpu_buffers_handle = gpu_handles.handle_; - size_t i = 0; - for (auto& input_tensor : infer_request->Inputs()) { - if (!input_tensor->IsCPU()) { - gpu_handles.data_.get()[i] = input_tensor->Memory()->ShmHandle(); - ++i; - } - } - } - catch (const PythonBackendException& exception) { - pb_exception = exception; - } + gpu_buffer_helper.Complete(Stub()->ShmPool()); + request_batch_shm_ptr->gpu_buffers_handle = + gpu_buffer_helper.ShmHandle(); bi::scoped_lock lock{ *(ipc_message->ResponseMutex())}; @@ -700,7 +651,7 @@ ModelInstanceState::ExecuteBLSRequest( ipc_message->ResponseCondition()->wait(lock); } - if (pb_exception.what() != nullptr) { + if (pb_exception.what() == std::string{""}) { auto callback = std::bind( &ModelInstanceState::SendBLSDecoupledResponse, this, std::placeholders::_1); @@ -714,7 +665,8 @@ ModelInstanceState::ExecuteBLSRequest( if (is_decoupled && (infer_response->Id() != nullptr)) { // Need to manage the lifetime of InferPayload object for bls // decoupled responses. - infer_payload_[reinterpret_cast(&infer_payload)] = + std::lock_guard lock(infer_payload_mu_); + infer_payload_[reinterpret_cast(infer_payload.get())] = infer_payload; } @@ -750,48 +702,6 @@ ModelInstanceState::ExecuteBLSRequest( } } -void -ModelInstanceState::DecoupledMessageQueueMonitor() -{ - while (decoupled_thread_) { - bi::managed_external_buffer::handle_t handle = - Stub()->ParentMessageQueue()->Pop(); - if (handle == DUMMY_MESSAGE) { - break; - } - std::unique_ptr message = - IPCMessage::LoadFromSharedMemory(Stub()->ShmPool(), handle); - - // Need to notify the model instance thread that the execute response has - // been received. 
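The infer_payload_ map touched above is shared between the StubToParent monitor thread and BLS completion callbacks, so every insert, lookup, and erase is now wrapped in a std::lock_guard on infer_payload_mu_. The same pattern in a self-contained form; the names below are illustrative, not from the diff.

// Illustrative sketch (not in this PR): a map of in-flight payloads shared by
// several threads, guarded by a mutex as infer_payload_ is above.
#include <cstdint>
#include <memory>
#include <mutex>
#include <unordered_map>

struct PayloadSketch {};  // stand-in for InferPayload

class PayloadRegistry {
 public:
  void Add(intptr_t id, std::shared_ptr<PayloadSketch> payload)
  {
    std::lock_guard<std::mutex> lock(mu_);
    payloads_[id] = std::move(payload);
  }

  void Remove(intptr_t id)
  {
    std::lock_guard<std::mutex> lock(mu_);
    payloads_.erase(id);
  }

 private:
  std::mutex mu_;
  std::unordered_map<intptr_t, std::shared_ptr<PayloadSketch>> payloads_;
};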
- if (message->Command() == PYTHONSTUB_ExecuteResponse) { - std::lock_guard guard{mu_}; - received_message_ = std::move(message); - cv_.notify_one(); - } else if (message->Command() == PYTHONSTUB_ResponseSend) { - std::shared_ptr response_send_message = std::move(message); - std::packaged_task task([this, response_send_message] { - ResponseSendDecoupled(response_send_message); - }); - std::future future = - boost::asio::post(*thread_pool_, std::move(task)); - futures_.emplace_back(std::move(future)); - } else if ( - message->Command() == PYTHONSTUB_InferExecRequest || - message->Command() == PYTHONSTUB_InferStreamExecRequest) { - std::shared_ptr bls_execute = std::move(message); - std::packaged_task task([this, bls_execute] { - ExecuteBLSRequest( - bls_execute, - (bls_execute->Command() == PYTHONSTUB_InferStreamExecRequest)); - }); - std::future future = - boost::asio::post(*thread_pool_, std::move(task)); - futures_.emplace_back(std::move(future)); - } - } -} - void ModelInstanceState::StubToParentMQMonitor() { @@ -804,15 +714,72 @@ ModelInstanceState::StubToParentMQMonitor() std::unique_ptr message = IPCMessage::LoadFromSharedMemory(Stub()->ShmPool(), handle); - if (message->Command() == PYTHONSTUB_LogRequest) { - ProcessLogRequest(message); - } else if (message->Command() == PYTHONSTUB_CleanupRequest) { - ProcessBLSCleanupRequest(message); + switch (message->Command()) { + case PYTHONSTUB_LogRequest: { + ProcessLogRequest(message); + break; + } + case PYTHONSTUB_BLSDecoupledInferPayloadCleanup: + case PYTHONSTUB_DecoupledResponseFactoryCleanup: { + ProcessCleanupRequest(message); + break; + } + case PYTHONSTUB_IsRequestCancelled: { + ProcessIsRequestCancelled(message); + break; + } + case PYTHONSTUB_MetricFamilyRequestNew: + case PYTHONSTUB_MetricFamilyRequestDelete: { + ProcessMetricFamilyRequest(message); + break; + } + case PYTHONSTUB_MetricRequestNew: + case PYTHONSTUB_MetricRequestDelete: + case PYTHONSTUB_MetricRequestValue: + case PYTHONSTUB_MetricRequestIncrement: + case PYTHONSTUB_MetricRequestSet: + case PYTHONSTUB_MetricRequestObserve: { + ProcessMetricRequest(message); + break; + } + case PYTHONSTUB_ModelReadinessRequest: + case PYTHONSTUB_LoadModelRequest: + case PYTHONSTUB_UnloadModelRequest: { + ProcessModelControlRequest(message); + break; + } + case PYTHONSTUB_ResponseSend: { + std::shared_ptr response_send_message = std::move(message); + std::packaged_task task([this, response_send_message] { + ResponseSendDecoupled(response_send_message); + }); + boost::asio::post(*thread_pool_, std::move(task)); + break; + } + case PYTHONSTUB_InferExecRequest: + case PYTHONSTUB_InferStreamExecRequest: { + std::shared_ptr bls_execute = std::move(message); + std::packaged_task task([this, bls_execute] { + ExecuteBLSRequest( + bls_execute, + (bls_execute->Command() == PYTHONSTUB_InferStreamExecRequest)); + }); + boost::asio::post(*thread_pool_, std::move(task)); + break; + } + case PYTHONSTUB_CancelBLSInferRequest: { + ProcessCancelBLSRequest(message); + break; + } + default: { + LOG_MESSAGE( + TRITONSERVER_LOG_ERROR, "Unexpected message type received."); + break; + } } } } - void ModelInstanceState::ProcessLogRequest( const std::unique_ptr& message) @@ -828,25 +795,25 @@ ModelInstanceState::ProcessLogRequest( LogLevel level = pb_log_message->Level(); switch (level) { - case LogLevel::INFO: { + case LogLevel::kInfo: { TRITONSERVER_LogMessage( TRITONSERVER_LOG_INFO, (filename.c_str()), line, (log_message.c_str())); break; } - case LogLevel::WARNING: { + case LogLevel::kWarning: { 
TRITONSERVER_LogMessage( TRITONSERVER_LOG_WARN, (filename.c_str()), line, (log_message.c_str())); break; } - case LogLevel::ERROR: { + case LogLevel::kError: { TRITONSERVER_LogMessage( TRITONSERVER_LOG_ERROR, (filename.c_str()), line, (log_message.c_str())); break; } - case LogLevel::VERBOSE: { + case LogLevel::kVerbose: { TRITONSERVER_LogMessage( TRITONSERVER_LOG_VERBOSE, (filename.c_str()), line, (log_message.c_str())); @@ -868,16 +835,24 @@ ModelInstanceState::ProcessLogRequest( } void -ModelInstanceState::ProcessBLSCleanupRequest( +ModelInstanceState::ProcessCleanupRequest( const std::unique_ptr& message) { AllocatedSharedMemory cleanup_request_message = Stub()->ShmPool()->Load(message->Args()); CleanupMessage* cleanup_message_ptr = reinterpret_cast(cleanup_request_message.data_.get()); - - void* id = cleanup_message_ptr->id; - infer_payload_.erase(id); + intptr_t id = reinterpret_cast(cleanup_message_ptr->id); + if (message->Command() == PYTHONSTUB_BLSDecoupledInferPayloadCleanup) { + // Remove the InferPayload object from the map. + std::lock_guard lock(infer_payload_mu_); + infer_payload_.erase(id); + } else if (message->Command() == PYTHONSTUB_DecoupledResponseFactoryCleanup) { + // Delete response factory + std::unique_ptr< + TRITONBACKEND_ResponseFactory, backend::ResponseFactoryDeleter> + response_factory(reinterpret_cast(id)); + } { bi::scoped_lock lock{*(message->ResponseMutex())}; @@ -886,6 +861,305 @@ ModelInstanceState::ProcessBLSCleanupRequest( } } +void +ModelInstanceState::ProcessCancelBLSRequest( + const std::unique_ptr& message) +{ + AllocatedSharedMemory message_shm = + Stub()->ShmPool()->Load(message->Args()); + CancelBLSRequestMessage* message_payload = + reinterpret_cast(message_shm.data_.get()); + + { + bi::scoped_lock lk{message_payload->mu}; + + intptr_t id = reinterpret_cast(message_payload->infer_payload_id); + try { + { + std::lock_guard lock(infer_payload_mu_); + if (infer_payload_.find(id) != infer_payload_.end()) { + infer_payload_[id]->SafeCancelRequest(); + } + } + message_payload->is_cancelled = true; + } + catch (const PythonBackendException& pb_exception) { + LOG_MESSAGE(TRITONSERVER_LOG_ERROR, pb_exception.what()); + } + + message_payload->waiting_on_stub = true; + message_payload->cv.notify_all(); + while (message_payload->waiting_on_stub) { + message_payload->cv.wait(lk); + } + } +} + +void +ModelInstanceState::ProcessIsRequestCancelled( + const std::unique_ptr& message) +{ + AllocatedSharedMemory message_shm = + Stub()->ShmPool()->Load(message->Args()); + IsCancelledMessage* message_payload = + reinterpret_cast(message_shm.data_.get()); + + { + bi::scoped_lock lk{message_payload->mu}; + + if (message_payload->response_factory_address != 0) { + TRITONBACKEND_ResponseFactory* response_factory = + reinterpret_cast( + message_payload->response_factory_address); + TRITONBACKEND_ResponseFactoryIsCancelled( + response_factory, &message_payload->is_cancelled); + } else if (message_payload->request_address != 0) { + TRITONBACKEND_Request* request = reinterpret_cast( + message_payload->request_address); + TRITONBACKEND_RequestIsCancelled(request, &message_payload->is_cancelled); + } else { + throw PythonBackendException("Cannot determine request cancellation"); + } + + message_payload->waiting_on_stub = true; + message_payload->cv.notify_all(); + while (message_payload->waiting_on_stub) { + message_payload->cv.wait(lk); + } + } +} + +template +void +ModelInstanceState::ProcessMessage( + const std::unique_ptr& ipc_message, + std::function&, MessageType*)> 
request_handler) +{ + AllocatedSharedMemory message = + Stub()->ShmPool()->Load(ipc_message->Args()); + MessageType* message_ptr = + reinterpret_cast(message.data_.get()); + std::unique_ptr pb_error_message; + PythonBackendException pb_exception(std::string{}); + std::unique_ptr object = + T::LoadFromSharedMemory(Stub()->ShmPool(), message_ptr->message); + + ScopedDefer _([message_ptr] { + { + bi::scoped_lock guard{message_ptr->mu}; + message_ptr->waiting_on_stub = true; + message_ptr->cv.notify_all(); + while (message_ptr->waiting_on_stub) { + message_ptr->cv.wait(guard); + } + } + }); + + try { + request_handler(object, message_ptr); + } + catch (const PythonBackendException& exception) { + pb_exception = exception; + } + + if (pb_exception.what() != std::string{}) { + message_ptr->has_error = true; + LOG_IF_EXCEPTION( + pb_error_message = + PbString::Create(Stub()->ShmPool(), pb_exception.what())); + message_ptr->error = pb_error_message->ShmHandle(); + message_ptr->is_error_set = true; + } +} + +void +ModelInstanceState::ProcessMetricFamilyRequest( + const std::unique_ptr& message) +{ + auto command = message->Command(); + ProcessMessage( + message, [this, command]( + std::unique_ptr& metric_family, + CustomMetricsMessage* metrics_message_ptr) { + switch (command) { + case PYTHONSTUB_MetricFamilyRequestNew: { + metrics_message_ptr->address = + metric_family->InitializeTritonMetricFamily(); + break; + } + case PYTHONSTUB_MetricFamilyRequestDelete: { + metric_family->ClearTritonMetricFamily(); + break; + } + default: { + throw PythonBackendException("Unknown metric family request kind"); + } + } + }); +} + +void +ModelInstanceState::ProcessMetricRequest( + const std::unique_ptr& message) +{ + auto command = message->Command(); + ProcessMessage( + message, [this, command]( + std::unique_ptr& metric, + CustomMetricsMessage* metrics_message_ptr) { + try { + switch (command) { + case PYTHONSTUB_MetricRequestNew: { + metrics_message_ptr->address = metric->InitializeTritonMetric(); + break; + } + case PYTHONSTUB_MetricRequestIncrement: + case PYTHONSTUB_MetricRequestSet: + case PYTHONSTUB_MetricRequestObserve: + case PYTHONSTUB_MetricRequestValue: { + metric->HandleMetricOperation(metrics_message_ptr, command); + break; + } + case PYTHONSTUB_MetricRequestDelete: { + metric->ClearTritonMetric(); + break; + } + default: { + throw PythonBackendException("Unknown metric request kind"); + } + } + } + catch (const PythonBackendException& exception) { + throw exception; + } + }); +} + +void +ModelInstanceState::ProcessModelControlRequest( + const std::unique_ptr& message) +{ + auto command = message->Command(); + ModelState* model_state = reinterpret_cast(Model()); + ProcessMessage( + message, [this, command, model_state]( + std::unique_ptr& model_loader, + ModelLoaderMessage* model_loader_msg_ptr) { + switch (command) { + case PYTHONSTUB_LoadModelRequest: { + model_loader->LoadModel(model_state->TritonServer()); + break; + } + case PYTHONSTUB_UnloadModelRequest: { + model_loader->UnloadModel(model_state->TritonServer()); + break; + } + case PYTHONSTUB_ModelReadinessRequest: { + model_loader_msg_ptr->is_model_ready = + model_loader->IsModelReady(model_state->TritonServer()); + break; + } + default: { + throw PythonBackendException("Unknown model loader request kind"); + } + } + }); +} + +TRITONSERVER_Error* +ModelInstanceState::SendMessageToStub( + bi::managed_external_buffer::handle_t message) +{ + bool success = false; + while (!success) { + uint64_t timeout_miliseconds = 1000; + { + 
boost::posix_time::ptime timeout = + boost::get_system_time() + + boost::posix_time::milliseconds(timeout_miliseconds); + + bi::scoped_lock lock( + *(Stub()->HealthMutex()), timeout); + + // Check if lock has been acquired. + if (lock) { + Stub()->IpcControl()->stub_health = false; + } else { + // If it failed to obtain the lock, it means that the stub has been + // stuck or exited while holding the health mutex lock. + return TRITONSERVER_ErrorNew( + TRITONSERVER_ERROR_INTERNAL, "Failed to obtain the health mutex."); + } + } + + Stub()->StubMessageQueue()->Push( + message, timeout_miliseconds /* duration ms */, success); + + if (!success && !IsStubProcessAlive()) { + return TRITONSERVER_ErrorNew( + TRITONSERVER_ERROR_INTERNAL, "Stub process is not healthy."); + } + } + + return nullptr; // success +} + +void +ModelInstanceState::SendMessageAndReceiveResponse( + bi::managed_external_buffer::handle_t message, + bi::managed_external_buffer::handle_t& response, + std::shared_ptr>& responses, + TRITONBACKEND_Request** requests, const uint32_t request_count) +{ + auto error = SendMessageToStub(message); + if (error != nullptr) { + RespondErrorToAllRequests( + TRITONSERVER_ErrorMessage(error), responses, requests, request_count); + + return; + } + + bi::managed_external_buffer::handle_t response_message; + error = Stub()->ReceiveMessageFromStub(response_message); + if (error != nullptr) { + RespondErrorToAllRequests( + TRITONSERVER_ErrorMessage(error), responses, requests, request_count); + + return; + } + + response = response_message; +} + +void +ModelInstanceState::RespondErrorToAllRequests( + const char* message, + std::shared_ptr>& responses, + TRITONBACKEND_Request** requests, const uint32_t request_count) +{ + for (uint32_t r = 0; r < request_count; ++r) { + if ((*responses)[r] == nullptr) + continue; + + std::string err_message = + std::string( + "Failed to process the request(s) for model instance '" + Name() + + "', message: ") + + message; + + TRITONSERVER_Error* err = + TRITONSERVER_ErrorNew(TRITONSERVER_ERROR_INTERNAL, err_message.c_str()); + LOG_IF_ERROR( + TRITONBACKEND_ResponseSend( + (*responses)[r], TRITONSERVER_RESPONSE_COMPLETE_FINAL, err), + "failed sending response"); + + (*responses)[r] = nullptr; + TRITONSERVER_ErrorDelete(err); + } +} + + void ModelInstanceState::StartMonitor() { @@ -899,6 +1173,7 @@ ModelInstanceState::TerminateMonitor() { if (stub_to_parent_thread_) { stub_to_parent_thread_ = false; + // Push a dummy message to signal the thread to terminate. 
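TerminateMonitor above stops the StubToParent monitor thread by clearing the flag and pushing DUMMY_MESSAGE, so the blocking Pop() in StubToParentMQMonitor wakes up and the thread can be joined. The sentinel pattern in a self-contained form, using a plain in-process queue instead of the shared-memory message queue; all names here are illustrative.

// Illustrative sketch (not in this PR): sentinel-based shutdown of a consumer
// thread that blocks on a queue, mirroring the DUMMY_MESSAGE pattern above.
#include <condition_variable>
#include <cstdint>
#include <mutex>
#include <queue>

constexpr uint64_t kDummyMessage = 0;

class BlockingQueue {
 public:
  void Push(uint64_t handle)
  {
    {
      std::lock_guard<std::mutex> lock(mu_);
      queue_.push(handle);
    }
    cv_.notify_one();
  }

  uint64_t Pop()
  {
    std::unique_lock<std::mutex> lock(mu_);
    cv_.wait(lock, [this] { return !queue_.empty(); });
    uint64_t handle = queue_.front();
    queue_.pop();
    return handle;
  }

 private:
  std::mutex mu_;
  std::condition_variable cv_;
  std::queue<uint64_t> queue_;
};

void
MonitorLoop(BlockingQueue& queue)
{
  while (true) {
    uint64_t handle = queue.Pop();
    if (handle == kDummyMessage) {
      break;  // The sentinel tells the consumer to exit so it can be joined.
    }
    // ... dispatch the message ...
  }
}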
Stub()->StubToParentMessageQueue()->Push(DUMMY_MESSAGE); stub_to_parent_queue_monitor_.join(); } @@ -915,6 +1190,17 @@ ModelInstanceState::ResponseSendDecoupled( ResponseSendMessage* send_message_payload = reinterpret_cast(send_message.data_.get()); std::unique_ptr error_message; + ScopedDefer response_factory_deleter([send_message_payload] { + if (send_message_payload->flags == TRITONSERVER_RESPONSE_COMPLETE_FINAL) { + TRITONBACKEND_ResponseFactory* response_factory = + reinterpret_cast( + send_message_payload->response_factory_address); + std::unique_ptr< + TRITONBACKEND_ResponseFactory, backend::ResponseFactoryDeleter> + lresponse_factory(reinterpret_cast( + response_factory)); + } + }); ScopedDefer _([send_message_payload] { { bi::scoped_lock guard{send_message_payload->mu}; @@ -931,8 +1217,10 @@ ModelInstanceState::ResponseSendDecoupled( reinterpret_cast( send_message_payload->response_factory_address); if (send_message_payload->flags == TRITONSERVER_RESPONSE_COMPLETE_FINAL) { - std::lock_guard guard{closed_requests_mutex_}; - closed_requests_.push_back(send_message_payload->request_address); + { + std::lock_guard guard{closed_requests_mutex_}; + closed_requests_.push_back(send_message_payload->request_address); + } } if (send_message_payload->response != 0) { @@ -942,32 +1230,33 @@ ModelInstanceState::ResponseSendDecoupled( false /* open cuda ipc handle */); bool requires_deferred_callback = false; - std::vector, void*>> gpu_output_buffers; - std::shared_ptr error = infer_response->Send( - response_factory, CudaStream(), requires_deferred_callback, - send_message_payload->flags, Stub()->ShmPool(), gpu_output_buffers); - SetErrorForResponseSendMessage(send_message_payload, error, error_message); + TRITONBACKEND_Response* response; + SetErrorForResponseSendMessage( + send_message_payload, + WrapTritonErrorInSharedPtr( + TRITONBACKEND_ResponseNewFromFactory(&response, response_factory)), + error_message); - if (requires_deferred_callback) { - AllocatedSharedMemory gpu_buffers_handle = - Stub()->ShmPool()->Construct( - sizeof(uint64_t) + - gpu_output_buffers.size() * - sizeof(bi::managed_external_buffer::handle_t)); - uint64_t* gpu_buffer_count = - reinterpret_cast(gpu_buffers_handle.data_.get()); - *gpu_buffer_count = gpu_output_buffers.size(); - bi::managed_external_buffer::handle_t* gpu_buffers_handle_shm = - reinterpret_cast( - gpu_buffers_handle.data_.get() + sizeof(uint64_t)); - send_message_payload->gpu_buffers_handle = gpu_buffers_handle.handle_; + std::vector, void*>> gpu_output_buffers; + GPUBuffersHelper gpu_buffer_helper; - size_t index = 0; - for (auto& output_buffer_pair : gpu_output_buffers) { - std::unique_ptr& pb_memory = output_buffer_pair.first; - gpu_buffers_handle_shm[index] = pb_memory->ShmHandle(); - ++index; +#ifdef TRITON_ENABLE_GPU + for (auto& output_tensor : infer_response->OutputTensors()) { + if (!output_tensor->IsCPU()) { + // Attempt to use the cuda shared memory pool for GPU tensor. + ShareCUDAMemoryPool(output_tensor->MemoryTypeId()); } + } +#endif // TRITON_ENABLE_GPU + + infer_response->Send( + response, CudaStream(), requires_deferred_callback, + send_message_payload->flags, Stub()->ShmPool(), gpu_buffer_helper, + gpu_output_buffers); + + if (requires_deferred_callback) { + gpu_buffer_helper.Complete(Stub()->ShmPool()); + send_message_payload->gpu_buffers_handle = gpu_buffer_helper.ShmHandle(); // Additional round trip so that the stub can fill the GPU output buffers. 
{ @@ -980,29 +1269,55 @@ ModelInstanceState::ResponseSendDecoupled( } } - index = 0; bool cuda_copy = false; for (auto& output_buffer_pair : gpu_output_buffers) { auto& pb_memory = output_buffer_pair.first; + void* pointer = output_buffer_pair.second; + bool cuda_used; - if (pb_memory->MemoryType() == TRITONSERVER_MEMORY_CPU) { - bool cuda_used; - void* pointer = output_buffer_pair.second; - - CopyBuffer( - "Failed to copy the output tensor to buffer.", - TRITONSERVER_MEMORY_CPU, 0, TRITONSERVER_MEMORY_CPU, 0, - pb_memory->ByteSize(), pb_memory->DataPtr(), pointer, - CudaStream(), &cuda_used); - cuda_copy |= cuda_used; - } - gpu_buffers_handle_shm[index] = pb_memory->ShmHandle(); - ++index; + try { + if (pb_memory->MemoryType() == TRITONSERVER_MEMORY_CPU) { + THROW_IF_TRITON_ERROR(CopyBuffer( + "Failed to copy the CPU output tensor to buffer.", + TRITONSERVER_MEMORY_CPU, 0, TRITONSERVER_MEMORY_CPU, 0, + pb_memory->ByteSize(), pb_memory->DataPtr(), pointer, + CudaStream(), &cuda_used)); + cuda_copy |= cuda_used; + } else if ( + (pb_memory->MemoryType() == TRITONSERVER_MEMORY_GPU) && + pb_memory->UseCUDASharedPool() && + (pb_memory->DataPtr() != pointer)) { + // If the data pointer from pb_memory is not the same as the + // pointer, it means that the Triton-provided buffer is not used + // during tensor transfer. Instead, an intermediate buffer that uses + // CUDA shared memory pool is used. In this case, we need to copy + // the data from the intermediate buffer back to the Triton-provided + // buffer. + THROW_IF_TRITON_ERROR(CopyBuffer( + "Failed to copy the GPU output tensor to buffer.", + TRITONSERVER_MEMORY_GPU, pb_memory->MemoryTypeId(), + TRITONSERVER_MEMORY_GPU, pb_memory->MemoryTypeId(), + pb_memory->ByteSize(), pb_memory->DataPtr(), pointer, + CudaStream(), &cuda_used)); + cuda_copy |= cuda_used; + } #ifdef TRITON_ENABLE_GPU - if (cuda_copy) { - cudaStreamSynchronize(stream_); - } + if (cuda_copy) { + cudaStreamSynchronize(stream_); + } #endif // TRITON_ENABLE_GPU + } + catch (const PythonBackendException& pb_exception) { + TRITONSERVER_Error* error = TRITONSERVER_ErrorNew( + TRITONSERVER_ERROR_INTERNAL, + (std::string( + "Failed to copy output tensor to Triton-provided buffer: ") + + pb_exception.what()) + .c_str()); + SetErrorForResponseSendMessage( + send_message_payload, WrapTritonErrorInSharedPtr(error), + error_message); + } } } } else { @@ -1010,20 +1325,13 @@ ModelInstanceState::ResponseSendDecoupled( response_factory, send_message_payload->flags); SetErrorForResponseSendMessage( send_message_payload, WrapTritonErrorInSharedPtr(error), error_message); - - if (send_message_payload->flags == TRITONSERVER_RESPONSE_COMPLETE_FINAL) { - std::unique_ptr< - TRITONBACKEND_ResponseFactory, backend::ResponseFactoryDeleter> - response_factory(reinterpret_cast( - send_message_payload->response_factory_address)); - } } } TRITONSERVER_Error* -ModelInstanceState::ProcessRequestsDecoupled( +ModelInstanceState::ProcessRequests( TRITONBACKEND_Request** requests, const uint32_t request_count, - std::vector>& pb_inference_requests, + std::vector>& pb_infer_requests, PbMetricReporter& reporter) { NVTX_RANGE(nvtx_, "ProcessRequests " + Name()); @@ -1049,8 +1357,7 @@ ModelInstanceState::ProcessRequestsDecoupled( std::shared_ptr> responses; RETURN_IF_ERROR(SaveRequestsToSharedMemory( - requests, request_count, pb_inference_requests, request_batch, - responses)); + requests, request_count, pb_infer_requests, request_batch, responses)); uint64_t compute_start_ns = 0; SET_TIMESTAMP(compute_start_ns); 
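Several of the hunks above lean on ScopedDefer (from this repository's scoped_defer.h) to run a callback when control leaves the scope, for example notifying the stub or releasing the response factory. For readers unfamiliar with the idiom, a minimal sketch of such a guard is shown below; this is not the repository's implementation, just the shape of it.

// Illustrative sketch (not the repository's scoped_defer.h): invoke a callback
// on scope exit, no matter how the scope is left.
#include <functional>
#include <utility>

class ScopedDeferSketch {
 public:
  explicit ScopedDeferSketch(std::function<void()> callback)
      : callback_(std::move(callback))
  {
  }

  ~ScopedDeferSketch()
  {
    if (callback_) {
      callback_();
    }
  }

  ScopedDeferSketch(const ScopedDeferSketch&) = delete;
  ScopedDeferSketch& operator=(const ScopedDeferSketch&) = delete;

 private:
  std::function<void()> callback_;
};

Usage mirrors the hunks above: construct the guard with a lambda immediately after acquiring whatever the lambda must later release or signal.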
@@ -1062,27 +1369,48 @@ ModelInstanceState::ProcessRequestsDecoupled( IPCMessage::Create(Stub()->ShmPool(), false /*inline_response*/)); ipc_message->Command() = PYTHONSTUB_CommandType::PYTHONSTUB_ExecuteRequest; ipc_message->Args() = request_batch.handle_; - received_message_ = nullptr; - ScopedDefer _([this] { Stub()->StubMessageQueue()->Push(DUMMY_MESSAGE); }); + ScopedDefer execute_finalize([this] { + // Push a dummy message to signal the thread to terminate. + Stub()->StubMessageQueue()->Push(DUMMY_MESSAGE); + }); + + std::unique_ptr response; { - std::unique_lock guard{mu_}; Stub()->StubMessageQueue()->Push(ipc_message->ShmHandle()); - cv_.wait(guard, [this] { return received_message_ != nullptr; }); + bi::managed_external_buffer::handle_t response_message; + RETURN_IF_ERROR(Stub()->ReceiveMessageFromStub(response_message)); + response = + IPCMessage::LoadFromSharedMemory(Stub()->ShmPool(), response_message); } - - AllocatedSharedMemory response_batch = - Stub()->ShmPool()->Load(received_message_->Args()); + char* ipc_message_shm = + reinterpret_cast(response->GetAllocatedSharedMemory().data_.get()); + ResponseBatch* response_batch_shm_ptr = + reinterpret_cast(ipc_message_shm + sizeof(IPCMessageShm)); uint64_t compute_end_ns = 0; SET_TIMESTAMP(compute_end_ns); reporter.SetComputeEndNs(compute_end_ns); - reporter.SetBatchStatistics(request_count); + reporter.SetBatchStatistics(total_batch_size); - if (response_batch.data_->has_error) { - if (response_batch.data_->is_error_set) { + if (response_batch_shm_ptr->has_error) { + // Clean up the response factory if an error occurred. The + // `is_response_factory_deleted` flag indicates whether the response factory + // has been deleted for some corner cases. + if (!response_batch_shm_ptr->is_response_factory_deleted) { + for (uint32_t r = 0; r < request_count; r++) { + TRITONBACKEND_ResponseFactory* response_factory = + reinterpret_cast( + pb_infer_requests[r]->GetResponseFactoryAddress()); + std::unique_ptr< + TRITONBACKEND_ResponseFactory, backend::ResponseFactoryDeleter> + lresponse_factory(reinterpret_cast( + response_factory)); + } + } + if (response_batch_shm_ptr->is_error_set) { auto error = PbString::LoadFromSharedMemory( - Stub()->ShmPool(), response_batch.data_->error); + Stub()->ShmPool(), response_batch_shm_ptr->error); return TRITONSERVER_ErrorNew( TRITONSERVER_ERROR_INTERNAL, error->String().c_str()); } @@ -1091,208 +1419,100 @@ ModelInstanceState::ProcessRequestsDecoupled( TRITONSERVER_ERROR_INTERNAL, "Failed to process the requests."); } - return nullptr; // success -} - -void -ModelInstanceState::ProcessRequests( - TRITONBACKEND_Request** requests, const uint32_t request_count, - bool& restart) -{ - NVTX_RANGE(nvtx_, "ProcessRequests " + Name()); - ModelState* model_state = reinterpret_cast(Model()); - std::string name = model_state->Name(); - - LOG_MESSAGE( - TRITONSERVER_LOG_VERBOSE, - (std::string("model ") + model_state->Name() + ", instance " + Name() + - ", executing " + std::to_string(request_count) + " requests") - .c_str()); - - uint64_t exec_start_ns = 0; - SET_TIMESTAMP(exec_start_ns); - - // We take the responsibility of the responses. 
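
The rewritten execute path above relies on ScopedDefer so the dummy message reaches the stub queue on every exit path, and on Complete() when the handshake should be finished early. A generic scope-guard with the same assumed shape (not the backend's actual scoped_defer.h implementation) might look like this:

// Sketch of a ScopedDefer-style guard: runs a callback when the scope ends,
// unless Complete() already ran it. Assumed shape only.
#include <functional>
#include <iostream>
#include <utility>

class ScopeGuard {
 public:
  explicit ScopeGuard(std::function<void()> fn) : fn_(std::move(fn)) {}
  ~ScopeGuard() {
    if (fn_) fn_();  // fires on every exit path (return, throw, fall-through)
  }
  void Complete() {  // run early and disarm, like execute_finalize.Complete()
    if (fn_) {
      fn_();
      fn_ = nullptr;
    }
  }
  ScopeGuard(const ScopeGuard&) = delete;
  ScopeGuard& operator=(const ScopeGuard&) = delete;

 private:
  std::function<void()> fn_;
};

int main() {
  {
    ScopeGuard unblock_stub([] { std::cout << "push DUMMY_MESSAGE\n"; });
    std::cout << "exchange request/response with the stub\n";
    // An early return or an exception here would still push the message.
  }
}
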
- std::shared_ptr> responses( - new std::vector()); - responses->reserve(request_count); - PbMetricReporter reporter( - TritonModelInstance(), requests, request_count, responses); - reporter.SetExecStartNs(exec_start_ns); - - for (size_t i = 0; i < request_count; i++) { - TRITONBACKEND_Response* response; - auto err = TRITONBACKEND_ResponseNew(&response, requests[i]); - if (err == nullptr) { - responses->emplace_back(response); - } else { - responses->emplace_back(nullptr); - LOG_MESSAGE(TRITONSERVER_LOG_ERROR, "Fail to create response"); - TRITONSERVER_ErrorDelete(err); - } - } - - size_t total_batch_size = 0; - RESPOND_ALL_AND_RETURN_IF_ERROR( - responses, request_count, - CheckIncomingRequests(requests, request_count, total_batch_size)); - - // No request to process - if (total_batch_size == 0) { - return; - } - - // Wait for all the pending BLS requests to be completed. - ScopedDefer bls_defer([this] { WaitForBLSRequestsToFinish(); }); - std::vector> pb_inference_requests; - AllocatedSharedMemory request_batch; - RESPOND_ALL_AND_RETURN_IF_ERROR( - responses, request_count, - SaveRequestsToSharedMemory( - requests, request_count, pb_inference_requests, request_batch, - responses)); - - std::shared_ptr ipc_message = - IPCMessage::Create(Stub()->ShmPool(), false /*inline_response*/); - ipc_message->Command() = PYTHONSTUB_CommandType::PYTHONSTUB_ExecuteRequest; - ipc_message->Args() = request_batch.handle_; - - uint64_t compute_start_ns = 0; - SET_TIMESTAMP(compute_start_ns); - reporter.SetComputeStartNs(compute_start_ns); - - // This means that the stub process has exited and Python - // backend failed to restart the stub process. - if (Stub()->StubPid() == 0) { - const char* error_message = "The stub process has exited unexpectedly."; - RespondErrorToAllRequests( - error_message, responses, requests, request_count); - return; - } - - bi::managed_external_buffer::handle_t response_message; - { - NVTX_RANGE(nvtx_, "StubProcessing " + Name()); - SendMessageAndReceiveResponse( - ipc_message->ShmHandle(), response_message, restart, responses, - requests, request_count); - } - - ScopedDefer execute_finalize([this, &restart] { - // Push a dummy message to the message queue so that - // the stub process is notified that it can release - // the object stored in shared memory. - NVTX_RANGE(nvtx_, "RequestExecuteFinalize " + Name()); - if (!restart) - Stub()->StubMessageQueue()->Push(DUMMY_MESSAGE); - }); - if (restart) { - return; - } - - RESPOND_ALL_AND_RETURN_IF_EXCEPTION( - responses, request_count, - ipc_message = IPCMessage::LoadFromSharedMemory( - Stub()->ShmPool(), response_message)); - - // If the stub command is no longer PYTHONSTUB_InferExecRequest, it indicates - // that inference request exeuction has finished and there are no more BLS - // requests to execute. Otherwise, the Python backend will continuosly execute - // BLS requests pushed to the message queue. 
- while (ipc_message->Command() == - PYTHONSTUB_CommandType::PYTHONSTUB_InferExecRequest || - ipc_message->Command() == - PYTHONSTUB_CommandType::PYTHONSTUB_InferStreamExecRequest) { - std::packaged_task task([this, ipc_message] { - ExecuteBLSRequest( - ipc_message, - (ipc_message->Command() == - PYTHONSTUB_CommandType::PYTHONSTUB_InferStreamExecRequest)); - }); - std::future future = - boost::asio::post(*thread_pool_, std::move(task)); - futures_.emplace_back(std::move(future)); - - auto error = Stub()->ReceiveMessageFromStub(response_message); - if (error != nullptr) { - restart = true; - RespondErrorToAllRequests( - TRITONSERVER_ErrorMessage(error), responses, requests, request_count); - return; - } - - RESPOND_ALL_AND_RETURN_IF_EXCEPTION( - responses, request_count, - ipc_message = IPCMessage::LoadFromSharedMemory( - Stub()->ShmPool(), response_message)); - } - - uint64_t compute_end_ns = 0; - SET_TIMESTAMP(compute_end_ns); - reporter.SetComputeEndNs(compute_end_ns); - - // Parsing the request response - AllocatedSharedMemory response_batch; - RESPOND_ALL_AND_RETURN_IF_EXCEPTION( - responses, request_count, - response_batch = Stub()->ShmPool()->Load(ipc_message->Args())); - - ResponseBatch* response_batch_shm_ptr = - reinterpret_cast(response_batch.data_.get()); - - // If inference fails, release all the requests and send an error response. - // If inference fails at this stage, it usually indicates a bug in the model - // code - if (response_batch_shm_ptr->has_error) { - if (response_batch_shm_ptr->is_error_set) { - std::unique_ptr error_message_shm; - RESPOND_ALL_AND_RETURN_IF_EXCEPTION( - responses, request_count, - error_message_shm = PbString::LoadFromSharedMemory( - Stub()->ShmPool(), response_batch_shm_ptr->error)); - RespondErrorToAllRequests( - error_message_shm->String().c_str(), responses, requests, - request_count); - } else { - const char* error_message = - "Failed to fetch the error in response batch."; - RespondErrorToAllRequests( - error_message, responses, requests, request_count); + if (response_batch_shm_ptr->batch_size > 0) { + bi::managed_external_buffer::handle_t* response_shm_handle = + reinterpret_cast( + ipc_message_shm + sizeof(ResponseBatch) + sizeof(IPCMessageShm)); + + std::shared_ptr> responses( + new std::vector()); + responses->reserve(request_count); + for (size_t i = 0; i < request_count; i++) { + // It is possible to have multiple responses batched together in a single + // response batch shm, where some of the responses are None due to the + // usage of response sender, so only create a TRITONBACKEND_Response + // object for the valid responses. + if (response_shm_handle[i] == 0) { + responses->emplace_back(nullptr); + } else { + TRITONBACKEND_Response* response; + auto err = TRITONBACKEND_ResponseNew(&response, requests[i]); + if (err == nullptr) { + responses->emplace_back(response); + } else { + responses->emplace_back(nullptr); + LOG_MESSAGE(TRITONSERVER_LOG_ERROR, "Fail to create response"); + TRITONSERVER_ErrorDelete(err); + } + } } - return; - } - bi::managed_external_buffer::handle_t* response_shm_handle = - reinterpret_cast( - response_batch.data_.get() + sizeof(ResponseBatch)); + std::vector requires_deferred_callback; - // If the output provided by the model is in GPU, we will pass the list of - // buffers provided by Triton to the stub process. 
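
The new branch above walks a flat shared-memory block laid out as [IPCMessageShm][ResponseBatch][handle array] and creates a TRITONBACKEND_Response only for entries whose handle is non-zero. A simplified stand-alone sketch of that layout and the skip rule, using placeholder structs rather than the real IPC definitions:

// Sketch of the flat layout the hunk above walks:
//   [IPCMessageShm][ResponseBatch][handle_t handles[batch_size]]
// FakeIpcHeader and FakeResponseBatch are simplified stand-ins.
#include <cstdint>
#include <iostream>
#include <vector>

struct FakeIpcHeader { uint64_t command; };                        // ~ IPCMessageShm
struct FakeResponseBatch { uint32_t batch_size; bool has_error; };  // ~ ResponseBatch
using Handle = uint64_t;                                            // ~ managed_external_buffer handle

int main() {
  // Build a fake "shared memory" blob with two responses; handle 0 marks a
  // response already delivered through the response sender (skip it).
  std::vector<char> shm(
      sizeof(FakeIpcHeader) + sizeof(FakeResponseBatch) + 2 * sizeof(Handle));
  auto* batch = reinterpret_cast<FakeResponseBatch*>(
      shm.data() + sizeof(FakeIpcHeader));
  batch->batch_size = 2;
  batch->has_error = false;
  auto* handles = reinterpret_cast<Handle*>(
      shm.data() + sizeof(FakeIpcHeader) + sizeof(FakeResponseBatch));
  handles[0] = 0;       // "None" response: no TRITONBACKEND_Response is created
  handles[1] = 0xBEEF;  // valid response: load it from shared memory

  for (uint32_t r = 0; r < batch->batch_size; ++r) {
    if (handles[r] == 0) {
      std::cout << "request " << r << ": skipped (sent via response sender)\n";
    } else {
      std::cout << "request " << r << ": load response from handle "
                << handles[r] << "\n";
    }
  }
}
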
- bool has_gpu_output = false; - std::vector requires_deferred_callback; + bool has_gpu_output = false; + std::vector> shm_responses; + std::vector, void*>>> + gpu_output_buffers(request_count); + GPUBuffersHelper gpu_buffer_helper; - std::vector> shm_responses; - std::vector, void*>>> - gpu_output_buffers(request_count); + for (uint32_t r = 0; r < request_count; ++r) { + NVTX_RANGE(nvtx_, "LoadingResponse " + Name()); + requires_deferred_callback.push_back(false); + if (response_shm_handle[r] == 0) { + continue; + } + TRITONBACKEND_Response* response = (*responses)[r]; + TRITONBACKEND_Request* request = requests[r]; + uint32_t requested_output_count = 0; - for (uint32_t r = 0; r < request_count; ++r) { - NVTX_RANGE(nvtx_, "LoadingResponse " + Name()); - TRITONBACKEND_Response* response = (*responses)[r]; - TRITONBACKEND_Request* request = requests[r]; - uint32_t requested_output_count = 0; - requires_deferred_callback.push_back(false); + shm_responses.emplace_back(nullptr); + std::unique_ptr& infer_response = shm_responses.back(); + try { + if (pb_infer_requests[r]->ReleaseFlags() == + TRITONSERVER_REQUEST_RELEASE_RESCHEDULE) { + // For rescheduled requests, we do not need to send a response. + LOG_IF_ERROR( + TRITONBACKEND_ResponseDelete((*responses)[r]), + "failed to delete response"); + (*responses)[r] = nullptr; + continue; + } + { + TRITONBACKEND_ResponseFactory* response_factory = + reinterpret_cast( + pb_infer_requests[r]->GetResponseFactoryAddress()); + std::unique_ptr< + TRITONBACKEND_ResponseFactory, backend::ResponseFactoryDeleter> + lresponse_factory( + reinterpret_cast( + response_factory)); + } + infer_response = InferResponse::LoadFromSharedMemory( + Stub()->ShmPool(), response_shm_handle[r], + false /* open_cuda_handle */); + if (infer_response->HasError()) { + TRITONSERVER_Error* err = TRITONSERVER_ErrorNew( + infer_response->Error()->Code(), + infer_response->Error()->Message().c_str()); - shm_responses.emplace_back(nullptr); - std::unique_ptr& infer_response = shm_responses.back(); - try { - infer_response = InferResponse::LoadFromSharedMemory( - Stub()->ShmPool(), response_shm_handle[r], - false /* open_cuda_handle */); - if (infer_response->HasError()) { + LOG_IF_ERROR( + TRITONBACKEND_ResponseSend( + (*responses)[r], TRITONSERVER_RESPONSE_COMPLETE_FINAL, err), + "failed sending response"); + TRITONSERVER_ErrorDelete(err); + (*responses)[r] = nullptr; + + // Reset the release flags for the request. + pb_infer_requests[r]->SetReleaseFlags( + TRITONSERVER_REQUEST_RELEASE_ALL); + + // If has_error is true, we do not look at the response tensors. + continue; + } + } + catch (const PythonBackendException& pb_exception) { TRITONSERVER_Error* err = TRITONSERVER_ErrorNew( - TRITONSERVER_ERROR_INTERNAL, - infer_response->Error()->Message().c_str()); - + TRITONSERVER_ERROR_INTERNAL, pb_exception.what()); LOG_IF_ERROR( TRITONBACKEND_ResponseSend( (*responses)[r], TRITONSERVER_RESPONSE_COMPLETE_FINAL, err), @@ -1300,136 +1520,118 @@ ModelInstanceState::ProcessRequests( TRITONSERVER_ErrorDelete(err); (*responses)[r] = nullptr; - // If has_error is true, we do not look at the response tensors. + // Reset the release flags for the request. 
+ pb_infer_requests[r]->SetReleaseFlags(TRITONSERVER_REQUEST_RELEASE_ALL); + continue; } - } - catch (const PythonBackendException& pb_exception) { - TRITONSERVER_Error* err = TRITONSERVER_ErrorNew( - TRITONSERVER_ERROR_INTERNAL, pb_exception.what()); - LOG_IF_ERROR( - TRITONBACKEND_ResponseSend( - (*responses)[r], TRITONSERVER_RESPONSE_COMPLETE_FINAL, err), - "failed sending response"); - TRITONSERVER_ErrorDelete(err); - (*responses)[r] = nullptr; - continue; - } - GUARDED_RESPOND_IF_ERROR( - responses, r, - TRITONBACKEND_RequestOutputCount(request, &requested_output_count)); - - std::set requested_output_names; - for (size_t j = 0; j < requested_output_count; ++j) { - const char* output_name; GUARDED_RESPOND_IF_ERROR( responses, r, - TRITONBACKEND_RequestOutputName(request, j, &output_name)); - requested_output_names.insert(output_name); - } - - bool require_deferred_callback = false; - - gpu_output_buffers[r] = - std::vector, void*>>{}; - std::shared_ptr error = infer_response->Send( - nullptr, CudaStream(), require_deferred_callback, - TRITONSERVER_RESPONSE_COMPLETE_FINAL, Stub()->ShmPool(), - gpu_output_buffers[r], requested_output_names, response); - GUARDED_RESPOND_IF_ERROR(responses, r, *error); + TRITONBACKEND_RequestOutputCount(request, &requested_output_count)); + std::set requested_output_names; + for (size_t j = 0; j < requested_output_count; ++j) { + const char* output_name; + GUARDED_RESPOND_IF_ERROR( + responses, r, + TRITONBACKEND_RequestOutputName(request, j, &output_name)); + requested_output_names.insert(output_name); + } - requires_deferred_callback[r] = require_deferred_callback; + bool require_deferred_callback = false; - // Error object will be deleted by the GUARDED_RESPOND macro - *error = nullptr; - error.reset(); - if (requires_deferred_callback[r]) { - has_gpu_output = true; - } - } +#ifdef TRITON_ENABLE_GPU + for (auto& output_tensor : infer_response->OutputTensors()) { + if (output_tensor->MemoryType() == TRITONSERVER_MEMORY_GPU) { + // Attempt to use the cuda shared memory pool for GPU tensor. + ShareCUDAMemoryPool(output_tensor->MemoryTypeId()); + } + } +#endif // TRITON_ENABLE_GPU - // Finalize the execute. - execute_finalize.Complete(); + gpu_output_buffers[r] = + std::vector, void*>>{}; + infer_response->Send( + response, CudaStream(), require_deferred_callback, + TRITONSERVER_RESPONSE_COMPLETE_FINAL, Stub()->ShmPool(), + gpu_buffer_helper, gpu_output_buffers[r], requested_output_names); - // If the output tensor is in GPU, there will be a second round trip - // required for filling the GPU buffers provided by the main process. 
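
Both the removed block and its replacement follow the same two-phase protocol for GPU outputs: responses that need GPU buffers are sent with a deferred callback, the stub is then asked to fill the Triton-provided buffers, and only afterwards are the deferred callbacks fired. A stripped-down sketch of that ordering with placeholder types (not the real InferResponse API):

// Two-phase send sketch: defer completion for responses with GPU output,
// run the buffer exchange, then flush the deferred callbacks.
// FakeResponse and its callbacks are placeholders, not backend APIs.
#include <functional>
#include <iostream>
#include <vector>

struct FakeResponse {
  bool has_gpu_output;
  std::function<void()> send;  // ~ InferResponse::Send / DeferredSendCallback
};

int main() {
  std::vector<FakeResponse> responses = {
      {false, [] { std::cout << "response 0: sent in the first pass\n"; }},
      {true, [] { std::cout << "response 1: sent after GPU buffers were filled\n"; }}};

  std::vector<bool> requires_deferred_callback;
  for (auto& r : responses) {
    requires_deferred_callback.push_back(r.has_gpu_output);
    if (!r.has_gpu_output) {
      r.send();  // CPU-only outputs complete immediately
    }
  }

  // Phase 2: in the backend this is the PYTHONSTUB_LoadGPUBuffers round trip
  // plus the copy-back loop; here it is just a stand-in.
  std::cout << "ask stub to fill Triton-provided GPU buffers\n";

  for (size_t r = 0; r < responses.size(); ++r) {
    if (requires_deferred_callback[r]) {
      responses[r].send();  // ~ DeferredSendCallback()
    }
  }
}
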
- if (has_gpu_output) { - size_t total_gpu_buffers_count = 0; - for (auto& gpu_output_buffer : gpu_output_buffers) { - total_gpu_buffers_count += gpu_output_buffer.size(); - } - AllocatedSharedMemory gpu_buffers_handle = - Stub()->ShmPool()->Construct( - sizeof(uint64_t) + - total_gpu_buffers_count * - sizeof(bi::managed_external_buffer::handle_t)); - uint64_t* gpu_buffer_count = - reinterpret_cast(gpu_buffers_handle.data_.get()); - *gpu_buffer_count = total_gpu_buffers_count; - bi::managed_external_buffer::handle_t* gpu_buffers_handle_shm = - reinterpret_cast( - gpu_buffers_handle.data_.get() + sizeof(uint64_t)); + requires_deferred_callback[r] = require_deferred_callback; - size_t index = 0; - for (auto& gpu_output_buffer : gpu_output_buffers) { - for (auto& buffer_memory_pair : gpu_output_buffer) { - gpu_buffers_handle_shm[index] = buffer_memory_pair.first->ShmHandle(); - ++index; + if (requires_deferred_callback[r]) { + has_gpu_output = true; } } - ipc_message->Command() = PYTHONSTUB_CommandType::PYTHONSTUB_LoadGPUBuffers; - ipc_message->Args() = gpu_buffers_handle.handle_; - SendMessageAndReceiveResponse( - ipc_message->ShmHandle(), response_message, restart, responses, - requests, 0); + execute_finalize.Complete(); - bool cuda_copy = false; + // If the output tensor is in GPU, there will be a second round trip + // required for filling the GPU buffers provided by the main process. + if (has_gpu_output) { + ipc_message->Command() = + PYTHONSTUB_CommandType::PYTHONSTUB_LoadGPUBuffers; + gpu_buffer_helper.Complete(Stub()->ShmPool()); + ipc_message->Args() = gpu_buffer_helper.ShmHandle(); + bi::managed_external_buffer::handle_t response_message; + SendMessageAndReceiveResponse( + ipc_message->ShmHandle(), response_message, responses, requests, 0); - index = 0; - uint32_t response_index = 0; - for (auto& gpu_output_buffer : gpu_output_buffers) { - for (auto& buffer_memory_pair : gpu_output_buffer) { - auto& pb_memory = buffer_memory_pair.first; - if (pb_memory->MemoryType() == TRITONSERVER_MEMORY_CPU) { - bool cuda_used = false; + bool cuda_copy = false; + + uint32_t response_index = 0; + for (auto& gpu_output_buffer : gpu_output_buffers) { + for (auto& buffer_memory_pair : gpu_output_buffer) { + auto& pb_memory = buffer_memory_pair.first; void* pointer = buffer_memory_pair.second; + bool cuda_used = false; - GUARDED_RESPOND_IF_ERROR( - responses, response_index, - CopyBuffer( - "Failed to copy the output tensor to buffer.", - TRITONSERVER_MEMORY_CPU, 0, TRITONSERVER_MEMORY_CPU, 0, - pb_memory->ByteSize(), pb_memory->DataPtr(), pointer, - CudaStream(), &cuda_used)); - cuda_copy |= cuda_used; + if (pb_memory->MemoryType() == TRITONSERVER_MEMORY_CPU) { + GUARDED_RESPOND_IF_ERROR( + responses, response_index, + CopyBuffer( + "Failed to copy the output tensor to buffer.", + TRITONSERVER_MEMORY_CPU, 0, TRITONSERVER_MEMORY_CPU, 0, + pb_memory->ByteSize(), pb_memory->DataPtr(), pointer, + CudaStream(), &cuda_used)); + cuda_copy |= cuda_used; + } else if ( + (pb_memory->MemoryType() == TRITONSERVER_MEMORY_GPU) && + pb_memory->UseCUDASharedPool() && + (pb_memory->DataPtr() != pointer)) { + // If the data pointer from pb_memory is not the same as the + // pointer, it means that the Triton-provided buffer is not used + // during tensor transfer. Instead, an intermediate buffer that uses + // CUDA shared memory pool is used. In this case, we need to copy + // the data from the intermediate buffer back to the Triton-provided + // buffer. 
+ GUARDED_RESPOND_IF_ERROR( + responses, response_index, + CopyBuffer( + "Failed to copy the output tensor to buffer.", + TRITONSERVER_MEMORY_GPU, pb_memory->MemoryTypeId(), + TRITONSERVER_MEMORY_GPU, pb_memory->MemoryTypeId(), + pb_memory->ByteSize(), pb_memory->DataPtr(), pointer, + CudaStream(), &cuda_used)); + cuda_copy |= cuda_used; + } } - gpu_buffers_handle_shm[index] = pb_memory->ShmHandle(); - ++index; - } - response_index++; + response_index++; #ifdef TRITON_ENABLE_GPU - if (cuda_copy) { - cudaStreamSynchronize(stream_); - } + if (cuda_copy) { + cudaStreamSynchronize(stream_); + } #endif // TRITON_ENABLE_GPU + } } - } - bls_defer.Complete(); - for (uint32_t r = 0; r < request_count; ++r) { - if (requires_deferred_callback[r]) { - shm_responses[r]->DeferredSendCallback(); + for (uint32_t r = 0; r < request_count; ++r) { + if (requires_deferred_callback[r]) { + shm_responses[r]->DeferredSendCallback(); + } } } - uint64_t exec_end_ns = 0; - SET_TIMESTAMP(exec_end_ns); - reporter.SetExecEndNs(exec_end_ns); - reporter.SetBatchStatistics(total_batch_size); - - return; + return nullptr; // success } void @@ -1460,16 +1662,36 @@ ModelInstanceState::PrepareResponseHandle( std::unique_ptr* infer_response, bi::managed_external_buffer::handle_t* response_handle) { +#ifdef TRITON_ENABLE_GPU + for (auto& output_tensor : (*infer_response)->OutputTensors()) { + if (!output_tensor->IsCPU()) { + // Attempt to use the cuda shared memory pool for GPU tensor. + ShareCUDAMemoryPool(output_tensor->MemoryTypeId()); + // It's possible that the CUDA memory pool offset isn't set correctly, + // even if the BLS output is using CUDA memory. This can occur when the + // CUDA memory pool hasn't been shared with the stub process at the time + // the BLS output is allocated during the ResponseAlloc callback. In such + // cases, we need to adjust the CUDA pool offset accordingly. + if (!output_tensor->Memory()->UseCUDASharedPool()) { + output_tensor->Memory()->UpdateCUDAOffset( + Stub()->ShmPool()->GetCUDAMemoryPoolManager()); + } + } + } +#endif // TRITON_ENABLE_GPU + (*infer_response)->SaveToSharedMemory(Stub()->ShmPool()); + for (auto& output_tensor : (*infer_response)->OutputTensors()) { - // For GPU tensors we need to store the memory release id in - // memory manager. if (!output_tensor->IsCPU()) { #ifdef TRITON_ENABLE_GPU - std::unique_ptr gpu_memory_record = - std::make_unique(output_tensor->Memory()->DataPtr()); + std::unique_ptr memory_record; + // Need to transfer the ownership of the BackendMemory to the + // MemoryManager so that the lifetime of the BackendMemory is managed. 
+ memory_record = std::make_unique( + output_tensor->Memory()->GetBackendMemory()); uint64_t memory_release_id = - Stub()->GetMemoryManager()->AddRecord(std::move(gpu_memory_record)); + Stub()->GetMemoryManager()->AddRecord(std::move(memory_record)); output_tensor->Memory()->SetMemoryReleaseId(memory_release_id); #endif } @@ -1493,6 +1715,7 @@ ModelInstanceState::SendBLSDecoupledResponse( ipc_message = IPCMessage::Create(Stub()->ShmPool(), true /* inline_response */); ipc_message->Args() = response_batch_shm.handle_; + ipc_message->Command() = PYTHONSTUB_InferStreamExecResponse; PrepareResponseBatch( &response_batch, response_batch_shm, &ipc_message, &response_handle); is_response_batch_set = true; @@ -1515,15 +1738,6 @@ ModelInstanceState::SendBLSDecoupledResponse( } } - ScopedDefer _([&ipc_message, response_batch] { - { - bi::scoped_lock lock{ - *(ipc_message->ResponseMutex())}; - response_batch->waiting_on_stub = false; - ipc_message->ResponseCondition()->notify_all(); - } - }); - { bi::scoped_lock lock{ *(ipc_message->ResponseMutex())}; @@ -1534,16 +1748,28 @@ ModelInstanceState::SendBLSDecoupledResponse( } } +void +ModelInstanceState::ShareCUDAMemoryPool(const int32_t device_id) +{ +#ifdef TRITON_ENABLE_GPU + try { + Stub()->ShareCUDAMemoryPool(Model()->TritonMemoryManager(), device_id); + } + catch (const PythonBackendException& ex) { + LOG_MESSAGE( + TRITONSERVER_LOG_WARN, + (std::string("Failed to share CUDA memory pool with stub process: ") + + ex.what() + ". Will use CUDA IPC.") + .c_str()); + } +#endif // TRITON_ENABLE_GPU +} + ModelInstanceState::~ModelInstanceState() { - ModelState* model_state = reinterpret_cast(Model()); Stub()->UpdateHealth(); if (Stub()->IsHealthy()) { - if (model_state->IsDecoupled()) { - futures_.clear(); - Stub()->ParentMessageQueue()->Push(DUMMY_MESSAGE); - decoupled_monitor_.join(); - } + // Wait for all the pending tasks to finish. 
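
The change above transfers ownership of each GPU output's backing memory to the memory manager and sends the stub only an integer release id. A minimal registry sketch of that id-based hand-off; the names are illustrative, not the real MemoryManager interface.

// Sketch of the release-id bookkeeping behind AddRecord()/SetMemoryReleaseId():
// the parent keeps the allocation alive in a table and hands the stub an
// integer key it can send back when the tensor is no longer needed.
#include <cstdint>
#include <functional>
#include <iostream>
#include <unordered_map>

class ReleaseRegistry {
 public:
  uint64_t AddRecord(std::function<void()> release_fn) {
    const uint64_t id = next_id_++;
    records_.emplace(id, std::move(release_fn));
    return id;
  }
  void Release(uint64_t id) {  // called when the stub reports it is done
    auto it = records_.find(id);
    if (it != records_.end()) {
      it->second();
      records_.erase(it);
    }
  }

 private:
  uint64_t next_id_ = 1;
  std::unordered_map<uint64_t, std::function<void()>> records_;
};

int main() {
  ReleaseRegistry registry;
  uint64_t id =
      registry.AddRecord([] { std::cout << "free GPU tensor memory\n"; });
  std::cout << "memory_release_id sent to the stub: " << id << "\n";
  registry.Release(id);  // simulates the stub's release message
}
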
thread_pool_->wait(); } // Terminate stub first to allow any last messages to be received by the back @@ -1551,7 +1777,6 @@ ModelInstanceState::~ModelInstanceState() Stub()->TerminateStub(); TerminateMonitor(); Stub()->ClearQueues(); - received_message_.reset(); Stub().reset(); } @@ -1606,6 +1831,8 @@ ModelState::ModelState(TRITONBACKEND_Model* triton_model) void* bstate; THROW_IF_BACKEND_MODEL_ERROR(TRITONBACKEND_BackendState(backend, &bstate)); backend_state_ = reinterpret_cast(bstate); + + runtime_modeldir_ = backend_state_->runtime_modeldir; triton::common::TritonJson::Value params; common::TritonJson::Value model_config; if (model_config_.Find("parameters", ¶ms)) { @@ -1717,9 +1944,33 @@ ModelState::ValidateModelConfig() return nullptr; } +TRITONSERVER_Error* +ModelState::SetModelConfig() +{ + BackendModel::SetModelConfig(); + // `Update model_transaction_policy` if setting was set + // with `set_model_transaction_policy` + triton::common::TritonJson::Value model_transaction_policy; + bool is_decoupled = false; + if (ModelConfig().Find( + "model_transaction_policy", &model_transaction_policy)) { + triton::common::TritonJson::Value decoupled; + if (model_transaction_policy.Find("decoupled", &decoupled)) { + auto error = decoupled.AsBool(&is_decoupled); + if (error != nullptr) { + throw BackendModelException(error); + } + SetDecoupled(is_decoupled); + } + } + + return nullptr; +} + + extern "C" { -TRITONSERVER_Error* +TRITONBACKEND_ISPEC TRITONSERVER_Error* TRITONBACKEND_Initialize(TRITONBACKEND_Backend* backend) { const char* cname; @@ -1763,14 +2014,16 @@ TRITONBACKEND_Initialize(TRITONBACKEND_Backend* backend) std::unique_ptr backend_state(new BackendState()); triton::common::TritonJson::Value cmdline; - backend_state->shm_default_byte_size = 64 * 1024 * 1024; // 64 MBs - backend_state->shm_growth_byte_size = 64 * 1024 * 1024; // 64 MBs + backend_state->shm_default_byte_size = 1 * 1024 * 1024; // 1 MB + backend_state->shm_growth_byte_size = 1 * 1024 * 1024; // 1 MB backend_state->stub_timeout_seconds = 30; backend_state->shm_message_queue_size = 1000; - backend_state->number_of_instance_inits = 0; backend_state->thread_pool_size = 32; + // Initialize shared memory region prefix to include backend's name + // to avoid collision between python backend and python-based backends. backend_state->shared_memory_region_prefix = - "triton_python_backend_shm_region_"; + "triton_" + name + "_backend_shm_region_"; + std::string default_backend_dir_string; if (backend_config.Find("cmdline", &cmdline)) { triton::common::TritonJson::Value shm_growth_size; @@ -1798,8 +2051,8 @@ TRITONBACKEND_Initialize(TRITONBACKEND_Backend* backend) RETURN_IF_ERROR(shm_default_size.AsString(&shm_default_byte_size)); try { backend_state->shm_default_byte_size = std::stol(shm_default_byte_size); - // Shared memory default byte size can't be less than 4 MBs. - if (backend_state->shm_default_byte_size < 4 * 1024 * 1024) { + // Shared memory default byte size can't be less than 1 MB. 
+ if (backend_state->shm_default_byte_size < 1 * 1024 * 1024) { return TRITONSERVER_ErrorNew( TRITONSERVER_ERROR_INVALID_ARG, (std::string("shm-default-byte-size") + @@ -1880,6 +2133,12 @@ TRITONBACKEND_Initialize(TRITONBACKEND_Backend* backend) return TRITONSERVER_ErrorNew(TRITONSERVER_ERROR_INVALID_ARG, ia.what()); } } + + triton::common::TritonJson::Value default_backend_dir; + if (cmdline.Find("backend-directory", &default_backend_dir)) { + RETURN_IF_ERROR( + default_backend_dir.AsString(&default_backend_dir_string)); + } } LOG_MESSAGE( @@ -1893,12 +2152,65 @@ TRITONBACKEND_Initialize(TRITONBACKEND_Backend* backend) .c_str()); // Use BackendArtifacts to determine the location of Python files - const char* location; + const char* clocation; TRITONBACKEND_ArtifactType artifact_type; RETURN_IF_ERROR( - TRITONBACKEND_BackendArtifacts(backend, &artifact_type, &location)); - backend_state->python_lib = location; + TRITONBACKEND_BackendArtifacts(backend, &artifact_type, &clocation)); + + const char os_slash = std::filesystem::path::preferred_separator; + std::string location(clocation); +#ifdef _WIN32 + const std::string stub_executable_name = "triton_python_backend_stub.exe"; + SanitizePath(location); + SanitizePath(default_backend_dir_string); +#else + const std::string stub_executable_name = "triton_python_backend_stub"; +#endif + // Check if `triton_python_backend_stub` and `triton_python_backend_utils.py` + // are located under `location`. + std::string default_python_backend_dir = + default_backend_dir_string + os_slash + "python"; + std::string backend_stub_path = location + os_slash + stub_executable_name; + std::string backend_utils = + location + os_slash + "triton_python_backend_utils.py"; + // Both, stub and utils should be in the same location + if (FileExists(backend_stub_path) && FileExists(backend_utils)) { + backend_state->python_lib = location; + // If `location` is default location of a python backend, + // then we are using default python backend. + if (default_python_backend_dir == location) { + backend_state->runtime_modeldir = ""; + } else { + // If `location` is not default location of a python backend, + // then we are using a python backend based backend and model.py stored + // in the received location. + backend_state->runtime_modeldir = location; + } + } else { + // If stub and utils are not found in received `location`, + // then we are using a python backend based backend and stub and utils are + // stored in the default python backend location. + if (!default_backend_dir_string.empty()) { + std::string backend_stub_path = default_backend_dir_string + os_slash + + "python" + os_slash + + stub_executable_name; + if (!FileExists(backend_stub_path)) { + return TRITONSERVER_ErrorNew( + TRITONSERVER_ERROR_NOT_FOUND, + (stub_executable_name + " is not found. Searched paths: " + + default_backend_dir_string + os_slash + "python and " + location) + .c_str()); + } + } + backend_state->runtime_modeldir = location; + backend_state->python_lib = + default_backend_dir_string + os_slash + "python"; + } +// FIXME [DLIS-5969]: Enable for Windows when custom execution environments +// are supported. 
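
The initialization code above decides python_lib and runtime_modeldir by probing for the stub executable and triton_python_backend_utils.py next to the backend artifacts, falling back to the default backend directory's python/ subfolder for python-based backends. A condensed std::filesystem sketch of that resolution, with made-up paths and a simplified result struct:

// Sketch of the stub/utils location resolution; paths and the struct are
// illustrative only, and the Windows .exe / path-sanitizing cases are omitted.
#include <filesystem>
#include <iostream>
#include <string>

namespace fs = std::filesystem;

struct ResolvedPaths {
  std::string python_lib;
  std::string runtime_modeldir;  // empty means "stock python backend"
};

ResolvedPaths ResolveStubLocation(
    const fs::path& location, const fs::path& default_backend_dir) {
  const fs::path default_python_dir = default_backend_dir / "python";
  const bool stub_here = fs::exists(location / "triton_python_backend_stub");
  const bool utils_here =
      fs::exists(location / "triton_python_backend_utils.py");

  if (stub_here && utils_here) {
    // Stub and utils ship with the artifacts: a python-based backend keeps
    // its own runtime model dir, the stock python backend does not.
    return {location.string(),
            (location == default_python_dir) ? "" : location.string()};
  }
  // Otherwise rely on the default python backend installation for the stub.
  return {default_python_dir.string(), location.string()};
}

int main() {
  ResolvedPaths p = ResolveStubLocation(
      "/opt/tritonserver/backends/my_python_based_backend",
      "/opt/tritonserver/backends");
  std::cout << "python_lib=" << p.python_lib
            << " runtime_modeldir=" << p.runtime_modeldir << "\n";
}
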
+#ifndef _WIN32 backend_state->env_manager = std::make_unique(); +#endif RETURN_IF_ERROR(TRITONBACKEND_BackendSetState( backend, reinterpret_cast(backend_state.get()))); @@ -1907,7 +2219,7 @@ TRITONBACKEND_Initialize(TRITONBACKEND_Backend* backend) return nullptr; } -TRITONSERVER_Error* +TRITONBACKEND_ISPEC TRITONSERVER_Error* TRITONBACKEND_Finalize(TRITONBACKEND_Backend* backend) { LOG_MESSAGE(TRITONSERVER_LOG_VERBOSE, "TRITONBACKEND_Finalize: Start"); @@ -1919,7 +2231,7 @@ TRITONBACKEND_Finalize(TRITONBACKEND_Backend* backend) return nullptr; // success } -TRITONSERVER_Error* +TRITONBACKEND_ISPEC TRITONSERVER_Error* TRITONBACKEND_ModelInitialize(TRITONBACKEND_Model* model) { const char* cname; @@ -1946,7 +2258,7 @@ TRITONBACKEND_ModelInitialize(TRITONBACKEND_Model* model) return nullptr; } -TRITONSERVER_Error* +TRITONBACKEND_ISPEC TRITONSERVER_Error* TRITONBACKEND_ModelFinalize(TRITONBACKEND_Model* model) { void* vstate; @@ -1962,7 +2274,7 @@ TRITONBACKEND_ModelFinalize(TRITONBACKEND_Model* model) return nullptr; } -TRITONSERVER_Error* +TRITONBACKEND_ISPEC TRITONSERVER_Error* TRITONBACKEND_ModelInstanceInitialize(TRITONBACKEND_ModelInstance* instance) { const char* cname; @@ -2005,7 +2317,7 @@ TRITONBACKEND_ModelInstanceInitialize(TRITONBACKEND_ModelInstance* instance) return nullptr; } -TRITONSERVER_Error* +TRITONBACKEND_ISPEC TRITONSERVER_Error* TRITONBACKEND_ModelInstanceExecute( TRITONBACKEND_ModelInstance* instance, TRITONBACKEND_Request** requests, const uint32_t request_count) @@ -2018,29 +2330,10 @@ TRITONBACKEND_ModelInstanceExecute( // If restart is equal to true, it indicates that the stub process is // unhealthy and needs a restart. - bool restart = false; - ModelState* model_state = - reinterpret_cast(instance_state->Model()); - if (!model_state->IsDecoupled()) { - instance_state->ProcessRequests(requests, request_count, restart); - - if (restart) { - LOG_MESSAGE( - TRITONSERVER_LOG_ERROR, - "Stub process is unhealthy and it will be restarted."); - instance_state->TerminateMonitor(); - instance_state->Stub()->KillStubProcess(); - TRITONSERVER_Error* err = instance_state->Stub()->Setup(); - if (err == nullptr) { - instance_state->StartMonitor(); - } - LOG_IF_ERROR(err, "Failed to restart the stub process."); - err = instance_state->Stub()->Launch(); - LOG_IF_ERROR(err, "Failed to restart the stub process."); - } - } else { - std::vector> infer_requests; + // TODO: Implement restart on decoupled + std::vector> infer_requests; + { uint64_t exec_start_ns = 0; SET_TIMESTAMP(exec_start_ns); @@ -2049,7 +2342,7 @@ TRITONBACKEND_ModelInstanceExecute( nullptr); reporter.SetExecStartNs(exec_start_ns); - error = instance_state->ProcessRequestsDecoupled( + error = instance_state->ProcessRequests( requests, request_count, infer_requests, reporter); uint64_t exec_end_ns = 0; @@ -2076,24 +2369,41 @@ TRITONBACKEND_ModelInstanceExecute( } } - // We should only delete the response factory for the requests that have - // not been closed. for (auto& infer_request : infer_requests) { - if (!instance_state->ExistsInClosedRequests( - infer_request->RequestAddress())) { - LOG_IF_ERROR( - infer_request->DeleteResponseFactory(), - "Failed to delete the response factory."); - } + // Reset the release flags for all the requests. + infer_request->SetReleaseFlags(TRITONSERVER_REQUEST_RELEASE_ALL); } } } + // The InferRequest object might not be created if an error occurs. Explicitly + // update the release flags here based on the number of InferRequest objects. 
+ std::vector request_release_flags( + request_count, TRITONSERVER_REQUEST_RELEASE_ALL); + for (size_t i = 0; i < infer_requests.size(); ++i) { + request_release_flags[i] = infer_requests[i]->ReleaseFlags(); + } + for (uint32_t r = 0; r < request_count; ++r) { TRITONBACKEND_Request* request = requests[r]; - LOG_IF_ERROR( - TRITONBACKEND_RequestRelease(request, TRITONSERVER_REQUEST_RELEASE_ALL), - "failed releasing request"); + try { + THROW_IF_TRITON_ERROR( + TRITONBACKEND_RequestRelease(request, request_release_flags[r])); + } + catch (const PythonBackendException& pb_exception) { + LOG_MESSAGE( + TRITONSERVER_LOG_ERROR, + (std::string("Failed to release request: ") + pb_exception.what()) + .c_str()); + if (request_release_flags[r] == TRITONSERVER_REQUEST_RELEASE_RESCHEDULE) { + // If error occurs during request rescheduling, release the request with + // `TRITONSERVER_REQUEST_RELEASE_ALL` flag. + LOG_IF_ERROR( + TRITONBACKEND_RequestRelease( + request, TRITONSERVER_REQUEST_RELEASE_ALL), + "Failed to release request."); + } + } } LOG_MESSAGE( @@ -2106,7 +2416,7 @@ TRITONBACKEND_ModelInstanceExecute( return nullptr; } -TRITONSERVER_Error* +TRITONBACKEND_ISPEC TRITONSERVER_Error* TRITONBACKEND_ModelInstanceFinalize(TRITONBACKEND_ModelInstance* instance) { void* vstate; @@ -2123,7 +2433,7 @@ TRITONBACKEND_ModelInstanceFinalize(TRITONBACKEND_ModelInstance* instance) return nullptr; } -TRITONSERVER_Error* +TRITONBACKEND_ISPEC TRITONSERVER_Error* TRITONBACKEND_GetBackendAttribute( TRITONBACKEND_Backend* backend, TRITONBACKEND_BackendAttribute* backend_attributes) @@ -2143,6 +2453,11 @@ TRITONBACKEND_GetBackendAttribute( backend_attributes, TRITONSERVER_INSTANCEGROUPKIND_CPU, 0, nullptr, 0)); #endif + // This backend can safely handle parallel calls to + // TRITONBACKEND_ModelInstanceInitialize (thread-safe). + RETURN_IF_ERROR(TRITONBACKEND_BackendAttributeSetParallelModelInstanceLoading( + backend_attributes, true)); + return nullptr; } diff --git a/src/python_be.h b/src/python_be.h index bc9fb187..6082c50b 100644 --- a/src/python_be.h +++ b/src/python_be.h @@ -1,4 +1,4 @@ -// Copyright 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// Copyright 2022-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions @@ -26,12 +26,9 @@ #pragma once -#include #include #include -#include -#include -#include + #include #include #include @@ -57,11 +54,14 @@ #include #include #include + #include "infer_request.h" #include "infer_response.h" #include "ipc_message.h" #include "memory_manager.h" #include "message_queue.h" +#include "metric.h" +#include "metric_family.h" #include "pb_env.h" #include "pb_map.h" #include "pb_metric_reporter.h" @@ -80,6 +80,14 @@ #include "triton/core/tritonbackend.h" #include "triton/core/tritonserver.h" +#ifdef _WIN32 +#define NOMINMAX +#include +#else +#include +#include +#endif + #define LOG_IF_EXCEPTION(X) \ do { \ try { \ @@ -213,7 +221,13 @@ struct BackendState { std::atomic number_of_instance_inits; std::string shared_memory_region_prefix; int64_t thread_pool_size; + +// FIXME [DLIS-5969]: Enable for Windows when custom execution environments +// are supported. +#ifndef _WIN32 std::unique_ptr env_manager; +#endif + std::string runtime_modeldir; }; class ModelState : public BackendModel { @@ -233,12 +247,22 @@ class ModelState : public BackendModel { // Is decoupled API being used. 
bool IsDecoupled() { return decoupled_; } + // Set decoupled mode + void SetDecoupled(bool decoupled) { decoupled_ = decoupled; } + + // Returns the value in the `runtime_modeldir_` field + std::string RuntimeModelDir() { return runtime_modeldir_; } + // Launch auto-complete stub process. TRITONSERVER_Error* LaunchAutoCompleteStubProcess(); // Validate Model Configuration TRITONSERVER_Error* ValidateModelConfig(); + // Overrides `BackendModel::SetModelConfig` to also + // set `ModelState::decoupled_` + TRITONSERVER_Error* SetModelConfig(); + // Auto-complete stub std::unique_ptr& Stub() { return auto_complete_stub_; } @@ -248,6 +272,7 @@ class ModelState : public BackendModel { std::string python_execution_env_; bool force_cpu_only_input_tensors_; bool decoupled_; + std::string runtime_modeldir_; std::unique_ptr auto_complete_stub_; }; @@ -262,15 +287,13 @@ class ModelInstanceState : public BackendModelInstance { std::thread stub_to_parent_queue_monitor_; bool stub_to_parent_thread_; - // Decoupled monitor thread - std::thread decoupled_monitor_; - bool decoupled_thread_; std::mutex mu_; std::condition_variable cv_; std::unique_ptr received_message_; std::vector> futures_; std::unique_ptr thread_pool_; - std::unordered_map> infer_payload_; + std::unordered_map> infer_payload_; + std::mutex infer_payload_mu_; std::unique_ptr request_executor_; public: @@ -283,28 +306,12 @@ class ModelInstanceState : public BackendModelInstance { // Launch stub process. TRITONSERVER_Error* LaunchStubProcess(); - TRITONSERVER_Error* SendMessageToStub(off_t message); void ResponseSendDecoupled(std::shared_ptr response_send_message); - // Checks whether the stub process is live - bool IsStubProcessAlive(); - - // Get a message from the stub process - void SendMessageAndReceiveResponse( - off_t message, off_t& response, bool& restart, - std::shared_ptr>& responses, - TRITONBACKEND_Request** requests, const uint32_t request_count); - - // Responds to all the requests with an error message. - void RespondErrorToAllRequests( - const char* message, - std::shared_ptr>& responses, - TRITONBACKEND_Request** requests, const uint32_t request_count); - - // In the decoupled mode, the parent message queue is monitored only by this - // function during the execute phase. No other thread should pop any message - // from the message queue in the decoupled mode. - void DecoupledMessageQueueMonitor(); + // The parent message queue is monitored only by this function during the + // execute phase. No other thread should pop any message from the message + // queue. + void MessageQueueMonitor(); // This function is executed on a separate thread and monitors the queue for // message sent from stub to parent process. @@ -319,13 +326,8 @@ class ModelInstanceState : public BackendModelInstance { TRITONBACKEND_Request* request, std::shared_ptr>& responses); - // Process all the requests obtained from Triton. - void ProcessRequests( - TRITONBACKEND_Request** requests, const uint32_t request_count, - bool& restart); - // Process all the requests in the decoupled mode. 
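
The SetModelConfig() override declared above exists so that a model_transaction_policy { decoupled: true } setting applied via set_model_transaction_policy during auto-complete still flips decoupled_. A standalone sketch of the same lookup over the JSON form of the config, using rapidjson directly instead of the backend's TritonJson wrapper; the JSON snippet is illustrative.

// Sketch: read model_transaction_policy.decoupled from a model config that
// has already been converted to JSON.
#include <iostream>
#include <string>

#include <rapidjson/document.h>

bool IsDecoupledConfig(const std::string& config_json) {
  rapidjson::Document doc;
  doc.Parse(config_json.c_str());
  if (doc.HasParseError() || !doc.IsObject()) {
    return false;
  }
  const auto policy_it = doc.FindMember("model_transaction_policy");
  if (policy_it == doc.MemberEnd() || !policy_it->value.IsObject()) {
    return false;  // no transaction policy: default (non-decoupled)
  }
  const auto decoupled_it = policy_it->value.FindMember("decoupled");
  return decoupled_it != policy_it->value.MemberEnd() &&
         decoupled_it->value.IsBool() && decoupled_it->value.GetBool();
}

int main() {
  std::cout << IsDecoupledConfig(
                   R"({"model_transaction_policy": {"decoupled": true}})")
            << "\n";  // prints 1
}
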
- TRITONSERVER_Error* ProcessRequestsDecoupled( + TRITONSERVER_Error* ProcessRequests( TRITONBACKEND_Request** requests, const uint32_t request_count, std::vector>& pb_infer_requests, PbMetricReporter& pb_metric_reporter); @@ -339,9 +341,6 @@ class ModelInstanceState : public BackendModelInstance { // Cleanup BLS responses void CleanupBLSResponses(); - // Wait for BLS requests to complete - void WaitForBLSRequestsToFinish(); - // Check the incoming requests for errors TRITONSERVER_Error* CheckIncomingRequests( TRITONBACKEND_Request** requests, const uint32_t request_count, @@ -359,6 +358,24 @@ class ModelInstanceState : public BackendModelInstance { AllocatedSharedMemory& request_batch, std::shared_ptr>& responses); + void SendMessageAndReceiveResponse( + bi::managed_external_buffer::handle_t message, + bi::managed_external_buffer::handle_t& response, + std::shared_ptr>& responses, + TRITONBACKEND_Request** requests, const uint32_t request_count); + + void RespondErrorToAllRequests( + const char* message, + std::shared_ptr>& responses, + TRITONBACKEND_Request** requests, const uint32_t request_count); + + // void SendMessageToStub(bi::managed_external_buffer::handle_t message); + TRITONSERVER_Error* SendMessageToStub( + bi::managed_external_buffer::handle_t message); + + // Checks whether the stub process is live + bool IsStubProcessAlive(); + // Model instance stub std::unique_ptr& Stub() { return model_instance_stub_; } @@ -383,7 +400,34 @@ class ModelInstanceState : public BackendModelInstance { std::unique_ptr* infer_response, bi::managed_external_buffer::handle_t* response_handle); - // Process the bls decoupled cleanup request - void ProcessBLSCleanupRequest(const std::unique_ptr& message); + // Process the decoupled cleanup request for InferPayload and ResponseFactory + void ProcessCleanupRequest(const std::unique_ptr& message); + + // Process cancelling a BLS request + void ProcessCancelBLSRequest(const std::unique_ptr& message); + + // Process request cancellation query + void ProcessIsRequestCancelled(const std::unique_ptr& message); + + // Process a message. The function 'request_handler' is invoked + // to handle the request. T should be either 'MetricFamily', 'Metric' or + // 'ModelLoader', and MessageType should be either 'MetricFamilyMessage', + // 'MetricMessage' or 'ModelLoaderMessage'. + template + void ProcessMessage( + const std::unique_ptr& message, + std::function&, MessageType*)> request_handler); + + // Process a metric family request + void ProcessMetricFamilyRequest(const std::unique_ptr& message); + + // Process a metric request + void ProcessMetricRequest(const std::unique_ptr& message); + + // Process a model control request + void ProcessModelControlRequest(const std::unique_ptr& message); + + // Attempt to share CUDA memory pool with the stub process + void ShareCUDAMemoryPool(const int32_t device_id); }; }}} // namespace triton::backend::python diff --git a/src/request_executor.cc b/src/request_executor.cc index 43556e70..716d3c56 100644 --- a/src/request_executor.cc +++ b/src/request_executor.cc @@ -1,4 +1,4 @@ -// Copyright 2021-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// Copyright 2021-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
// // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions @@ -27,6 +27,8 @@ #include "request_executor.h" #include + +#include "correlation_id.h" #include "pb_utils.h" #include "scoped_defer.h" #include "triton/backend/backend_common.h" @@ -47,10 +49,10 @@ MemoryTypeToTritonMemoryType( const PreferredMemory::MemoryType& memory_type) { switch (memory_type) { - case PreferredMemory::MemoryType::CPU: + case PreferredMemory::MemoryType::kCPU: *triton_memory_type = TRITONSERVER_MEMORY_CPU; break; - case PreferredMemory::MemoryType::GPU: + case PreferredMemory::MemoryType::kGPU: *triton_memory_type = TRITONSERVER_MEMORY_GPU; break; @@ -67,9 +69,15 @@ InferRequestComplete( TRITONSERVER_InferenceRequest* request, const uint32_t flags, void* userp) { if (request != nullptr) { + RequestCompletionUserp* completion_userp = + reinterpret_cast(userp); + completion_userp->infer_payload->SetRequestAddress(0L); + LOG_IF_ERROR( TRITONSERVER_InferenceRequestDelete(request), "Failed to delete inference request."); + + delete completion_userp; } } @@ -77,14 +85,22 @@ void InferResponseComplete( TRITONSERVER_InferenceResponse* response, const uint32_t flags, void* userp) { - auto p = reinterpret_cast(userp); + auto linfer_payload = reinterpret_cast(userp); + std::shared_ptr infer_payload = linfer_payload->GetPtr(); std::unique_ptr infer_response; std::vector> output_tensors; std::shared_ptr pb_error; + std::string parameters_string; + TRITONSERVER_Error_Code error_code = TRITONSERVER_ERROR_INTERNAL; if (response != nullptr) { try { - THROW_IF_TRITON_ERROR(TRITONSERVER_InferenceResponseError(response)); + TRITONSERVER_Error* server_error = + TRITONSERVER_InferenceResponseError(response); + if (server_error != nullptr) { + error_code = TRITONSERVER_ErrorCode(server_error); + } + THROW_IF_TRITON_ERROR(server_error); uint32_t output_count; THROW_IF_TRITON_ERROR( @@ -107,7 +123,6 @@ InferResponseComplete( std::string sname = cname; std::vector dims_vector{shape, shape + dim_count}; - // userp is only set for the CPU tensors if (memory_type != TRITONSERVER_MEMORY_GPU) { if (byte_size != 0) { std::shared_ptr pb_tensor = std::make_shared( @@ -127,12 +142,49 @@ InferResponseComplete( nullptr /* DLManagedTensor */)); } } else { - output_tensors.push_back(std::make_shared( + std::shared_ptr pb_tensor = std::make_shared( sname, dims_vector, datatype, memory_type, memory_type_id, const_cast(base), byte_size, - nullptr /* DLManagedTensor */)); + nullptr /* DLManagedTensor */); + + std::unique_ptr pb_memory( + reinterpret_cast(userp)); + pb_tensor->SetMemory(std::move(pb_memory)); + output_tensors.push_back(pb_tensor); } } + + triton::common::TritonJson::Value parameters_json( + triton::common::TritonJson::ValueType::OBJECT); + uint32_t parameter_count; + THROW_IF_TRITON_ERROR(TRITONSERVER_InferenceResponseParameterCount( + response, ¶meter_count)); + + for (size_t i = 0; i < parameter_count; i++) { + const char* name; + TRITONSERVER_ParameterType type; + const void* vvalue; + THROW_IF_TRITON_ERROR(TRITONSERVER_InferenceResponseParameter( + response, i, &name, &type, &vvalue)); + if (type == TRITONSERVER_PARAMETER_INT) { + THROW_IF_TRITON_ERROR(parameters_json.AddInt( + name, *(reinterpret_cast(vvalue)))); + } else if (type == TRITONSERVER_PARAMETER_BOOL) { + THROW_IF_TRITON_ERROR(parameters_json.AddBool( + name, *(reinterpret_cast(vvalue)))); + } else if (type == TRITONSERVER_PARAMETER_STRING) { + std::string string = 
reinterpret_cast(vvalue); + THROW_IF_TRITON_ERROR(parameters_json.AddString(name, string)); + } else { + throw PythonBackendException( + (std::string("Unsupported parameter type for parameter '") + + name + "'.")); + } + } + + triton::common::TritonJson::WriteBuffer buffer; + THROW_IF_TRITON_ERROR(parameters_json.Write(&buffer)); + parameters_string = buffer.Contents(); } catch (const PythonBackendException& pb_exception) { if (response != nullptr) { @@ -142,24 +194,25 @@ InferResponseComplete( response = nullptr; } - pb_error = std::make_shared(pb_exception.what()); + pb_error = std::make_shared(pb_exception.what(), error_code); output_tensors.clear(); } - if (!p->IsDecoupled()) { + if (!infer_payload->IsDecoupled()) { infer_response = std::make_unique( - output_tensors, pb_error, true /* is_last_response */); + output_tensors, pb_error, parameters_string, + true /* is_last_response */); } else { if ((flags & TRITONSERVER_RESPONSE_COMPLETE_FINAL) == 0) { - // Not the last reponse. + // Not the last response. infer_response = std::make_unique( - output_tensors, pb_error, false /* is_last_response */, - userp /* id */); + output_tensors, pb_error, parameters_string, + false /* is_last_response */, userp /* id */); } else { // The last response. infer_response = std::make_unique( - output_tensors, pb_error, true /* is_last_response */, - userp /* id */); + output_tensors, pb_error, parameters_string, + true /* is_last_response */, userp /* id */); } } @@ -167,23 +220,20 @@ InferResponseComplete( TRITONSERVER_InferenceResponseDelete(response), "Failed to release BLS inference response."); } else if ( - p->IsDecoupled() && (flags & TRITONSERVER_RESPONSE_COMPLETE_FINAL) != 0) { - // An empty response may be the last reponse for decoupled models. + (infer_payload)->IsDecoupled() && + (flags & TRITONSERVER_RESPONSE_COMPLETE_FINAL) != 0) { + // An empty response may be the last response for decoupled models. infer_response = std::make_unique( - output_tensors, pb_error, true /* is_last_response */, userp /* id */); + output_tensors, pb_error, "" /* parameters */, + true /* is_last_response */, userp /* id */); } else { pb_error = std::make_shared("Unexpected empty response."); infer_response = std::make_unique( - output_tensors, pb_error, true /* is_last_response */, userp /* id */); + output_tensors, pb_error, "" /* parameters */, + true /* is_last_response */, userp /* id */); } - // Only set value to the promise with the first response. Call the callback - // function to send decoupled response to the stub. 
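
The parameter-forwarding code above flattens each TRITONSERVER response parameter (INT, BOOL or STRING) into a single JSON object string that travels back to the stub with the BLS response. A self-contained sketch of that serialization using std::variant and rapidjson in place of the TRITONSERVER_InferenceResponseParameter / TritonJson calls; the parameter names in main() are made up.

// Sketch: flatten typed (name, value) response parameters into one JSON
// object string, mirroring the INT/BOOL/STRING switch above.
#include <cstdint>
#include <iostream>
#include <string>
#include <utility>
#include <variant>
#include <vector>

#include <rapidjson/stringbuffer.h>
#include <rapidjson/writer.h>

using ParamValue = std::variant<int64_t, bool, std::string>;

std::string ParametersToJson(
    const std::vector<std::pair<std::string, ParamValue>>& params) {
  rapidjson::StringBuffer buffer;
  rapidjson::Writer<rapidjson::StringBuffer> writer(buffer);
  writer.StartObject();
  for (const auto& [name, value] : params) {
    writer.Key(name.c_str());
    if (std::holds_alternative<int64_t>(value)) {
      writer.Int64(std::get<int64_t>(value));
    } else if (std::holds_alternative<bool>(value)) {
      writer.Bool(std::get<bool>(value));
    } else {
      writer.String(std::get<std::string>(value).c_str());
    }
    // Any other parameter type would be rejected, matching the
    // PythonBackendException thrown in the hunk above.
  }
  writer.EndObject();
  return buffer.GetString();
}

int main() {
  std::cout << ParametersToJson({{"sequence_id", int64_t{42}},
                                 {"sequence_end", true},
                                 {"note", std::string("abc")}})
            << "\n";  // {"sequence_id":42,"sequence_end":true,"note":"abc"}
}
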
- if (p->IsPromiseSet()) { - p->Callback(std::move(infer_response)); - } else { - p->SetValueForPrevPromise(std::move(infer_response)); - } + infer_payload->SetValue(std::move(infer_response)); } TRITONSERVER_Error* @@ -201,7 +251,7 @@ ResponseAlloc( ScopedDefer _([&shm_pool] { shm_pool.release(); }); if (p->preferred_memory.PreferredMemoryType() == - PreferredMemory::MemoryType::DEFAULT) { + PreferredMemory::MemoryType::kDefault) { *actual_memory_type = preferred_memory_type; *actual_memory_type_id = preferred_memory_type_id; } else { @@ -244,24 +294,27 @@ ResponseAlloc( } break; #ifdef TRITON_ENABLE_GPU case TRITONSERVER_MEMORY_GPU: { - auto err = cudaSetDevice(*actual_memory_type_id); - if ((err != cudaSuccess) && (err != cudaErrorNoDevice) && - (err != cudaErrorInsufficientDriver)) { - return TRITONSERVER_ErrorNew( - TRITONSERVER_ERROR_INTERNAL, - std::string( - "unable to set current CUDA device: " + - std::string(cudaGetErrorString(err))) - .c_str()); - } + BackendMemory* backend_memory; + std::unique_ptr lbackend_memory; + try { + THROW_IF_TRITON_ERROR(BackendMemory::Create( + reinterpret_cast( + shm_pool->GetCUDAMemoryPoolManager()->TritonMemoryManager()), + {BackendMemory::AllocationType::GPU_POOL, + BackendMemory::AllocationType::GPU}, + *actual_memory_type_id, byte_size, &backend_memory)); + lbackend_memory.reset(backend_memory); - err = cudaMalloc(buffer, byte_size); - if (err != cudaSuccess) { - return TRITONSERVER_ErrorNew( - TRITONSERVER_ERROR_INTERNAL, - std::string( - "cudaMalloc failed: " + std::string(cudaGetErrorString(err))) - .c_str()); + std::unique_ptr pb_memory = PbMemory::Create( + shm_pool, std::move(lbackend_memory), true /* copy_gpu */); + *buffer = pb_memory->DataPtr(); + *buffer_userp = reinterpret_cast(pb_memory.get()); + pb_memory.release(); + } + catch (const PythonBackendException& pb_exception) { + TRITONSERVER_Error* err = + CreateTritonErrorFromException(pb_exception); + return err; } break; } @@ -272,6 +325,18 @@ ResponseAlloc( return nullptr; // Success } +void +InferRequestCancel(intptr_t request_address) +{ + if (request_address == 0L) { + return; + } + + TRITONSERVER_InferenceRequest* irequest = + reinterpret_cast(request_address); + THROW_IF_TRITON_ERROR(TRITONSERVER_InferenceRequestCancel(irequest)); +} + TRITONSERVER_Error* OutputBufferQuery( TRITONSERVER_ResponseAllocator* allocator, void* userp, @@ -314,6 +379,7 @@ RequestExecutor::Infer( bool is_ready = false; const char* model_name = infer_request->ModelName().c_str(); TRITONSERVER_InferenceRequest* irequest = nullptr; + RequestCompletionUserp* completion_userp = nullptr; try { int64_t model_version = infer_request->ModelVersion(); @@ -339,8 +405,8 @@ RequestExecutor::Infer( std::string("Model ") + model_name + " is using the decoupled. The current BLS request call doesn't " "support models using the decoupled transaction policy. 
Please use " - "stream API 'stream_exec()' or 'async_stream_exec() for decoupled " - "models.'"); + "'decoupled=True' argument to the 'exec' or 'async_exec' calls for " + "decoupled models.'"); } // Inference @@ -350,8 +416,14 @@ RequestExecutor::Infer( THROW_IF_TRITON_ERROR(TRITONSERVER_InferenceRequestSetId( irequest, infer_request->RequestId().c_str())); - THROW_IF_TRITON_ERROR(TRITONSERVER_InferenceRequestSetCorrelationId( - irequest, infer_request->CorrelationId())); + if (infer_request->GetCorrelationId().Type() == + CorrelationIdDataType::UINT64) { + THROW_IF_TRITON_ERROR(TRITONSERVER_InferenceRequestSetCorrelationId( + irequest, infer_request->GetCorrelationId().UnsignedIntValue())); + } else { + THROW_IF_TRITON_ERROR(TRITONSERVER_InferenceRequestSetCorrelationIdString( + irequest, infer_request->GetCorrelationId().StringValue().c_str())); + } THROW_IF_TRITON_ERROR(TRITONSERVER_InferenceRequestSetFlags( irequest, infer_request->Flags())); @@ -359,8 +431,48 @@ RequestExecutor::Infer( THROW_IF_TRITON_ERROR(TRITONSERVER_InferenceRequestSetTimeoutMicroseconds( irequest, infer_request->Timeout())); + completion_userp = new RequestCompletionUserp(infer_payload); THROW_IF_TRITON_ERROR(TRITONSERVER_InferenceRequestSetReleaseCallback( - irequest, InferRequestComplete, nullptr /* request_release_userp */)); + irequest, InferRequestComplete, + reinterpret_cast(completion_userp))); + + TRITONSERVER_InferenceTrace* trace = nullptr; + if (infer_request->GetTrace().TritonTrace() != nullptr) { + THROW_IF_TRITON_ERROR(TRITONSERVER_InferenceTraceSpawnChildTrace( + reinterpret_cast( + infer_request->GetTrace().TritonTrace()), + &trace)); + } + + const std::string& param_str = infer_request->Parameters(); + triton::common::TritonJson::Value param; + THROW_IF_TRITON_ERROR(param.Parse(param_str.c_str(), param_str.length())); + std::vector param_keys; + THROW_IF_TRITON_ERROR(param.Members(¶m_keys)); + for (const auto& key : param_keys) { + triton::common::TritonJson::Value value; + if (!param.Find(key.c_str(), &value)) { + throw PythonBackendException("Unexpected missing key on parameters"); + } + if (value.IsString()) { + std::string string_value; + THROW_IF_TRITON_ERROR(value.AsString(&string_value)); + THROW_IF_TRITON_ERROR(TRITONSERVER_InferenceRequestSetStringParameter( + irequest, key.c_str(), string_value.c_str())); + } else if (value.IsInt()) { + int64_t int_value = 0; + THROW_IF_TRITON_ERROR(value.AsInt(&int_value)); + THROW_IF_TRITON_ERROR(TRITONSERVER_InferenceRequestSetIntParameter( + irequest, key.c_str(), int_value)); + } else if (value.IsBool()) { + bool bool_value = false; + THROW_IF_TRITON_ERROR(value.AsBool(&bool_value)); + THROW_IF_TRITON_ERROR(TRITONSERVER_InferenceRequestSetBoolParameter( + irequest, key.c_str(), bool_value)); + } else { + throw PythonBackendException("Unsupported value type on parameters"); + } + } for (auto& infer_input : infer_request->Inputs()) { THROW_IF_TRITON_ERROR(TRITONSERVER_InferenceRequestAddInput( @@ -391,11 +503,21 @@ RequestExecutor::Infer( reinterpret_cast(infer_payload->ResponseAllocUserp().get()), InferResponseComplete, reinterpret_cast(infer_payload.get()))); - THROW_IF_TRITON_ERROR(TRITONSERVER_ServerInferAsync( - server_, irequest, nullptr /* trace */)); + // Store the inference request address submitted to the Triton server for + // retrieval + infer_payload->SetRequestAddress(reinterpret_cast(irequest)); + infer_payload->SetRequestCancellationFunc(InferRequestCancel); + + THROW_IF_TRITON_ERROR( + TRITONSERVER_ServerInferAsync(server_, irequest, 
trace)); } } catch (const PythonBackendException& pb_exception) { + infer_payload->SetRequestAddress(0L); + if (completion_userp != nullptr) { + delete completion_userp; + } + LOG_IF_ERROR( TRITONSERVER_InferenceRequestDelete(irequest), "Failed to delete inference request."); diff --git a/src/request_executor.h b/src/request_executor.h index 56ed5ca5..07562d6a 100644 --- a/src/request_executor.h +++ b/src/request_executor.h @@ -1,4 +1,4 @@ -// Copyright 2021-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// Copyright 2021-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions @@ -27,6 +27,7 @@ #pragma once #include + #include "infer_payload.h" #include "infer_request.h" #include "infer_response.h" @@ -36,6 +37,12 @@ namespace triton { namespace backend { namespace python { TRITONSERVER_Error* CreateTritonErrorFromException( const PythonBackendException& pb_exception); +struct RequestCompletionUserp { + std::shared_ptr infer_payload; + RequestCompletionUserp(std::shared_ptr& infer_payload) + : infer_payload(infer_payload){}; +}; + class RequestExecutor { TRITONSERVER_ResponseAllocator* response_allocator_ = nullptr; TRITONSERVER_Server* server_; diff --git a/src/resources/triton_python_backend_utils.py b/src/resources/triton_python_backend_utils.py index e2045429..de332cf7 100644 --- a/src/resources/triton_python_backend_utils.py +++ b/src/resources/triton_python_backend_utils.py @@ -1,4 +1,4 @@ -# Copyright 2020-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# Copyright 2020-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions @@ -24,24 +24,25 @@ # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -import numpy as np -import struct import json +import struct + +import numpy as np TRITON_STRING_TO_NUMPY = { - 'TYPE_BOOL': bool, - 'TYPE_UINT8': np.uint8, - 'TYPE_UINT16': np.uint16, - 'TYPE_UINT32': np.uint32, - 'TYPE_UINT64': np.uint64, - 'TYPE_INT8': np.int8, - 'TYPE_INT16': np.int16, - 'TYPE_INT32': np.int32, - 'TYPE_INT64': np.int64, - 'TYPE_FP16': np.float16, - 'TYPE_FP32': np.float32, - 'TYPE_FP64': np.float64, - 'TYPE_STRING': np.object_ + "TYPE_BOOL": bool, + "TYPE_UINT8": np.uint8, + "TYPE_UINT16": np.uint16, + "TYPE_UINT32": np.uint32, + "TYPE_UINT64": np.uint64, + "TYPE_INT8": np.int8, + "TYPE_INT16": np.int16, + "TYPE_INT32": np.int32, + "TYPE_INT64": np.int64, + "TYPE_FP16": np.float16, + "TYPE_FP32": np.float32, + "TYPE_FP64": np.float64, + "TYPE_STRING": np.object_, } @@ -71,10 +72,9 @@ def serialize_byte_tensor(input_tensor): # If the input is a tensor of string/bytes objects, then must flatten those # into a 1-dimensional array containing the 4-byte byte size followed by the # actual element bytes. All elements are concatenated together in "C" order. 
- if (input_tensor.dtype == np.object_) or (input_tensor.dtype.type - == np.bytes_): + if (input_tensor.dtype == np.object_) or (input_tensor.dtype.type == np.bytes_): flattened_ls = [] - for obj in np.nditer(input_tensor, flags=["refs_ok"], order='C'): + for obj in np.nditer(input_tensor, flags=["refs_ok"], order="C"): # If directly passing bytes to BYTES type, # don't convert it to str as Python will encode the # bytes which may distort the meaning @@ -82,12 +82,12 @@ def serialize_byte_tensor(input_tensor): if type(obj.item()) == bytes: s = obj.item() else: - s = str(obj.item()).encode('utf-8') + s = str(obj.item()).encode("utf-8") else: s = obj.item() flattened_ls.append(struct.pack(" max_batch_size: raise ValueError( - "configuration specified max_batch_size " + - str(self._model_config["max_batch_size"]) + - ", but in auto-complete-config function for model '" + - self._model_config["name"] + "' specified max_batch_size " + - str(max_batch_size)) + "configuration specified max_batch_size " + + str(self._model_config["max_batch_size"]) + + ", but in auto-complete-config function for model '" + + self._model_config["name"] + + "' specified max_batch_size " + + str(max_batch_size) + ) else: self._model_config["max_batch_size"] = max_batch_size def set_dynamic_batching(self): - """Set dynamic_batching as the scheduler for the model if no scheduler - is set. If dynamic_batching is set in the model configuration, then no + """Set dynamic_batching as the scheduler for the model if no scheduler + is set. If dynamic_batching is set in the model configuration, then no action is taken and return success. Raises ------ ValueError - If the 'sequence_batching' or 'ensemble_scheduling' scheduler is + If the 'sequence_batching' or 'ensemble_scheduling' scheduler is set for this model configuration. """ found_scheduler = None @@ -359,10 +361,13 @@ def set_dynamic_batching(self): if found_scheduler != None: raise ValueError( - "Configuration specified scheduling_choice as '" \ - + found_scheduler + "', but auto-complete-config " \ - "function for model '" + self._model_config["name"] - + "' tries to set scheduling_choice as 'dynamic_batching'") + "Configuration specified scheduling_choice as '" + + found_scheduler + + "', but auto-complete-config " + "function for model '" + + self._model_config["name"] + + "' tries to set scheduling_choice as 'dynamic_batching'" + ) if "dynamic_batching" not in self._model_config: self._model_config["dynamic_batching"] = {} @@ -376,58 +381,92 @@ def add_input(self, input): Raises ------ ValueError - If input contains property other than 'name', 'data_type' - and 'dims' or any of the properties are not set, or if an - input with the same name already exists in the configuration - but has different data_type or dims property + If input contains property other than 'name', 'data_type', + 'dims', 'optional' or any of the non-optional properties + are not set, or if an input with the same name already exists + in the configuration but has different data_type or dims property """ - valid_properties = ['name', 'data_type', 'dims'] + valid_properties = ["name", "data_type", "dims", "optional"] for current_property in input: if current_property not in valid_properties: raise ValueError( - "input '" + input['name'] + - "' in auto-complete-config function for model '" + - self._model_config["name"] + - "' contains property other than 'name', 'data_type' and 'dims'." 
+ "input '" + + input["name"] + + "' in auto-complete-config function for model '" + + self._model_config["name"] + + "' contains property other than 'name', 'data_type', 'dims' and 'optional'." ) - if 'name' not in input: + if "name" not in input: + raise ValueError( + "input in auto-complete-config function for model '" + + self._model_config["name"] + + "' is missing 'name' property." + ) + elif "data_type" not in input: raise ValueError( - "input in auto-complete-config function for model '" + - self._model_config["name"] + "' is missing 'name' property.") - elif 'data_type' not in input: - raise ValueError("input '" + input['name'] + - "' in auto-complete-config function for model '" + - self._model_config["name"] + - "' is missing 'data_type' property.") - elif 'dims' not in input: - raise ValueError("input '" + input['name'] + - "' in auto-complete-config function for model '" + - self._model_config["name"] + - "' is missing 'dims' property.") + "input '" + + input["name"] + + "' in auto-complete-config function for model '" + + self._model_config["name"] + + "' is missing 'data_type' property." + ) + elif "dims" not in input: + raise ValueError( + "input '" + + input["name"] + + "' in auto-complete-config function for model '" + + self._model_config["name"] + + "' is missing 'dims' property." + ) for current_input in self._model_config["input"]: - if input['name'] == current_input['name']: - if current_input[ - 'data_type'] != "TYPE_INVALID" and current_input[ - 'data_type'] != input['data_type']: - raise ValueError("unable to load model '" + - self._model_config["name"] + - "', configuration expects datatype " + - current_input['data_type'] + - " for input '" + input['name'] + - "', model provides " + input['data_type']) - elif current_input[ - 'dims'] and current_input['dims'] != input['dims']: + if input["name"] == current_input["name"]: + if ( + current_input["data_type"] != "TYPE_INVALID" + and current_input["data_type"] != input["data_type"] + ): + raise ValueError( + "unable to load model '" + + self._model_config["name"] + + "', configuration expects datatype " + + current_input["data_type"] + + " for input '" + + input["name"] + + "', model provides " + + input["data_type"] + ) + elif current_input["dims"] and current_input["dims"] != input["dims"]: raise ValueError( - "model '" + self._model_config["name"] + "', tensor '" + - input['name'] + "': the model expects dims " + - str(input['dims']) + - " but the model configuration specifies dims " + - str(current_input['dims'])) + "model '" + + self._model_config["name"] + + "', tensor '" + + input["name"] + + "': the model expects dims " + + str(input["dims"]) + + " but the model configuration specifies dims " + + str(current_input["dims"]) + ) + elif ( + "optional" in current_input + and "optional" in input + and current_input["optional"] != input["optional"] + ): + raise ValueError( + "model '" + + self._model_config["name"] + + "', tensor '" + + input["name"] + + "': the model expects optional " + + str(input["optional"]) + + " but the model configuration specifies optional " + + str(current_input["optional"]) + ) else: - current_input['data_type'] = input['data_type'] - current_input['dims'] = input['dims'] + current_input["data_type"] = input["data_type"] + current_input["dims"] = input["dims"] + if "optional" in input: + current_input["optional"] = input["optional"] return self._model_config["input"].append(input) @@ -446,58 +485,126 @@ def add_output(self, output): output with the same name already exists in the 
configuration but has different data_type or dims property """ - valid_properties = ['name', 'data_type', 'dims'] + valid_properties = ["name", "data_type", "dims"] for current_property in output: if current_property not in valid_properties: raise ValueError( - "output '" + output['name'] + - "' in auto-complete-config function for model '" + - self._model_config["name"] + - "' contains property other than 'name', 'data_type' and 'dims'." + "output '" + + output["name"] + + "' in auto-complete-config function for model '" + + self._model_config["name"] + + "' contains property other than 'name', 'data_type' and 'dims'." ) - if 'name' not in output: + if "name" not in output: + raise ValueError( + "output in auto-complete-config function for model '" + + self._model_config["name"] + + "' is missing 'name' property." + ) + elif "data_type" not in output: raise ValueError( - "output in auto-complete-config function for model '" + - self._model_config["name"] + "' is missing 'name' property.") - elif 'data_type' not in output: - raise ValueError("output '" + output['name'] + - "' in auto-complete-config function for model '" + - self._model_config["name"] + - "' is missing 'data_type' property.") - elif 'dims' not in output: - raise ValueError("output '" + output['name'] + - "' in auto-complete-config function for model '" + - self._model_config["name"] + - "' is missing 'dims' property.") + "output '" + + output["name"] + + "' in auto-complete-config function for model '" + + self._model_config["name"] + + "' is missing 'data_type' property." + ) + elif "dims" not in output: + raise ValueError( + "output '" + + output["name"] + + "' in auto-complete-config function for model '" + + self._model_config["name"] + + "' is missing 'dims' property." + ) for current_output in self._model_config["output"]: - if output['name'] == current_output['name']: - if current_output[ - 'data_type'] != "TYPE_INVALID" and current_output[ - 'data_type'] != output['data_type']: - raise ValueError("unable to load model '" + - self._model_config["name"] + - "', configuration expects datatype " + - current_output['data_type'] + - " for output '" + output['name'] + - "', model provides " + output['data_type']) - elif current_output[ - 'dims'] and current_output['dims'] != output['dims']: + if output["name"] == current_output["name"]: + if ( + current_output["data_type"] != "TYPE_INVALID" + and current_output["data_type"] != output["data_type"] + ): + raise ValueError( + "unable to load model '" + + self._model_config["name"] + + "', configuration expects datatype " + + current_output["data_type"] + + " for output '" + + output["name"] + + "', model provides " + + output["data_type"] + ) + elif ( + current_output["dims"] and current_output["dims"] != output["dims"] + ): raise ValueError( - "model '" + self._model_config["name"] + "', tensor '" + - output['name'] + "': the model expects dims " + - str(output['dims']) + - " but the model configuration specifies dims " + - str(current_output['dims'])) + "model '" + + self._model_config["name"] + + "', tensor '" + + output["name"] + + "': the model expects dims " + + str(output["dims"]) + + " but the model configuration specifies dims " + + str(current_output["dims"]) + ) else: - current_output['data_type'] = output['data_type'] - current_output['dims'] = output['dims'] + current_output["data_type"] = output["data_type"] + current_output["dims"] = output["dims"] return self._model_config["output"].append(output) + def set_model_transaction_policy(self, 
transaction_policy_dict): + """ + Set model transaction policy for the model. + Parameters + ---------- + transaction_policy_dict : dict + The dict, containing all properties to be set as a part + of `model_transaction_policy` field. + Raises + ------ + ValueError + If transaction_policy_dict contains property other + than 'decoupled', or if `model_transaction_policy` already exists + in the configuration, but has different `decoupled` property. + """ + valid_properties = ["decoupled"] + for current_property in transaction_policy_dict.keys(): + if current_property not in valid_properties: + raise ValueError( + "model transaction property in auto-complete-config " + + "function for model '" + + self._model_config["name"] + + "' contains property other than 'decoupled'." + ) + + if "model_transaction_policy" not in self._model_config: + self._model_config["model_transaction_policy"] = {} + + if "decoupled" in transaction_policy_dict.keys(): + if ( + "decoupled" in self._model_config["model_transaction_policy"] + and self._model_config["model_transaction_policy"]["decoupled"] + != transaction_policy_dict["decoupled"] + ): + raise ValueError( + "trying to change decoupled property in auto-complete-config " + + "for model '" + + self._model_config["name"] + + "', which is already set to '" + + str(self._model_config["model_transaction_policy"]["decoupled"]) + + "'." + ) + + self._model_config["model_transaction_policy"][ + "decoupled" + ] = transaction_policy_dict["decoupled"] + TRITONSERVER_REQUEST_FLAG_SEQUENCE_START = 1 TRITONSERVER_REQUEST_FLAG_SEQUENCE_END = 2 TRITONSERVER_RESPONSE_COMPLETE_FINAL = 1 +TRITONSERVER_REQUEST_RELEASE_ALL = 1 +TRITONSERVER_REQUEST_RELEASE_RESCHEDULE = 2 diff --git a/src/response_sender.cc b/src/response_sender.cc index e8394df9..ef3b09dd 100644 --- a/src/response_sender.cc +++ b/src/response_sender.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// Copyright 2022-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions @@ -25,47 +25,108 @@ // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "response_sender.h" + #include #include + #include "pb_stub.h" #include "pb_stub_utils.h" #include "scoped_defer.h" namespace triton { namespace backend { namespace python { +void +CheckResponseSenderArguments( + const std::shared_ptr& response, const uint32_t flags) +{ + // Check the correctness of the provided flags. + if (flags != TRITONSERVER_RESPONSE_COMPLETE_FINAL && flags != 0) { + throw PythonBackendException( + "Unable to send response. 
Unsupported flag provided."); + } + + if (flags == 0 && response == nullptr) { + throw PythonBackendException( + "Inference Response object must be provided when the response flags is " + "set to zero."); + } +} + ResponseSender::ResponseSender( intptr_t request_address, intptr_t response_factory_address, - std::unique_ptr& shm_pool) + bool const* is_decoupled, + const std::set& requested_output_names, + std::unique_ptr& shm_pool, + const std::shared_ptr& pb_cancel) : request_address_(request_address), - response_factory_address_(response_factory_address), shm_pool_(shm_pool), - closed_(false) + response_factory_address_(response_factory_address), + is_decoupled_(is_decoupled), + requested_output_names_(requested_output_names), shm_pool_(shm_pool), + pb_cancel_(pb_cancel), closed_(false), number_of_response_sent_(0), + response_factory_deleted_(false) { } +ResponseSender::~ResponseSender() +{ + DeleteResponseFactory(); +} void -ResponseSender::Send( - std::shared_ptr infer_response, const uint32_t flags) +ResponseSender::UpdateStateAndCounters( + InferResponse* response, const uint32_t flags) { + if (is_decoupled_ == nullptr) { + // TODO: Can a model access the response sender on a BLS infer request? + throw PythonBackendException( + "Unable to send response. Response sender has no reference to the " + "decoupled state of the model."); + } + bool is_decoupled = *is_decoupled_; + + std::lock_guard lk(mu_); + + if (!is_decoupled) { + if (response != nullptr && number_of_response_sent_ > 0) { + throw PythonBackendException( + "Unable to send response. Non-decoupled model cannot send more than " + "one response."); + } + if (response == nullptr && flags == TRITONSERVER_RESPONSE_COMPLETE_FINAL && + number_of_response_sent_ == 0) { + throw PythonBackendException( + "Unable to send response. Non-decoupled model cannot send complete " + "final before sending a response."); + } + } + if (closed_) { throw PythonBackendException( "Unable to send response. Response sender has been closed."); } if (flags == TRITONSERVER_RESPONSE_COMPLETE_FINAL) { + response_factory_deleted_.exchange(true); closed_ = true; } + number_of_response_sent_++; +} - // Check the correctness of the provided flags. - if (flags != TRITONSERVER_RESPONSE_COMPLETE_FINAL && flags != 0) { - throw PythonBackendException( - "Unable to send response. Unsupported flag provided."); - } +void +ResponseSender::Send( + std::shared_ptr infer_response, const uint32_t flags) +{ + // Release the GIL. This avoids a potential deadlock situation in the parent + // process, where every thread in the thread pool is indirectly waiting for a + // function in the stub process that acquires the GIL. Meanwhile, the current + // thread, which holds the GIL, is also waiting for the parent side to have + // the next available thread to pick up the job during resource contention. + py::gil_scoped_release release; - if (flags == 0 && infer_response == nullptr) { - throw PythonBackendException( - "Inference Response object must be provided when the response flags is " - "set to zero."); + CheckResponseSenderArguments(infer_response, flags); + UpdateStateAndCounters(infer_response.get(), flags); + if (infer_response) { + infer_response->PruneOutputTensors(requested_output_names_); } std::unique_ptr& stub = Stub::GetOrCreateInstance(); @@ -112,7 +173,11 @@ ResponseSender::Send( { bi::scoped_lock guard{send_message_payload->mu}; - stub->SendIPCMessage(ipc_message); + // The server will destruct the response factory if the final flag is set. 
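On the Python side, the checks added in UpdateStateAndCounters above map onto the documented response-sender pattern: a non-decoupled model may send at most one response and must send it before (or together with) the final flag, the final flag closes the sender, and cancellation can be polled between sends. A minimal decoupled-model sketch, not the backend's own code:

```python
import numpy as np
import triton_python_backend_utils as pb_utils


class TritonPythonModel:
    def execute(self, requests):
        for request in requests:
            sender = request.get_response_sender()
            for step in range(4):
                if sender.is_cancelled():
                    break
                out = pb_utils.Tensor("OUTPUT0", np.array([step], dtype=np.int32))
                sender.send(pb_utils.InferenceResponse(output_tensors=[out]))
            # The final flag closes the sender; further sends would raise.
            sender.send(flags=pb_utils.TRITONSERVER_RESPONSE_COMPLETE_FINAL)
        return None
```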
+ if (flags == TRITONSERVER_RESPONSE_COMPLETE_FINAL) { + response_factory_deleted_.exchange(true); + } + stub->SendIPCUtilsMessage(ipc_message); while (!send_message_payload->is_stub_turn) { send_message_payload->cv.wait(guard); } @@ -130,43 +195,50 @@ ResponseSender::Send( } if (has_gpu_output) { - AllocatedSharedMemory gpu_buffers_handle = - shm_pool_->Load(send_message_payload->gpu_buffers_handle); - - bi::managed_external_buffer::handle_t* gpu_buffers_handle_shm = - reinterpret_cast( - gpu_buffers_handle.data_.get() + sizeof(uint64_t)); - uint64_t* gpu_buffer_count = - reinterpret_cast(gpu_buffers_handle.data_.get()); - if (gpu_tensors.size() != *gpu_buffer_count) { - LOG_INFO - << (std::string( - "GPU buffers size does not match the provided buffers: ") + - std::to_string(gpu_tensors.size()) + - " != " + std::to_string(*gpu_buffer_count)); - return; + ScopedDefer _([send_message_payload] { + bi::scoped_lock guard{send_message_payload->mu}; + send_message_payload->is_stub_turn = false; + send_message_payload->cv.notify_one(); + while (!send_message_payload->is_stub_turn) { + // Wait for the stub process to send the response and populate error + // message if any. + send_message_payload->cv.wait(guard); + } + }); + + AllocatedSharedMemory gpu_buffers_handle = + shm_pool_->Load( + send_message_payload->gpu_buffers_handle); + if (!gpu_buffers_handle.data_->success) { + std::unique_ptr error = PbString::LoadFromSharedMemory( + shm_pool_, gpu_buffers_handle.data_->error); + throw PythonBackendException( + "Failed to load GPU buffers: " + error->String()); + } + + AllocatedSharedMemory + gpu_buffers_handle_shm = + shm_pool_->Load( + gpu_buffers_handle.data_->buffers); + uint64_t gpu_buffer_count = gpu_buffers_handle.data_->buffer_count; + if (gpu_tensors.size() != gpu_buffer_count) { + throw PythonBackendException( + std::string( + "GPU buffers size does not match the provided buffers: ") + + std::to_string(gpu_tensors.size()) + + " != " + std::to_string(gpu_buffer_count)); } std::vector> dst_buffers; for (size_t i = 0; i < gpu_tensors.size(); i++) { std::unique_ptr dst_buffer = PbMemory::LoadFromSharedMemory( - shm_pool_, gpu_buffers_handle_shm[i], true /* open_cuda_handle */); + shm_pool_, gpu_buffers_handle_shm.data_.get()[i], + true /* open_cuda_handle */); dst_buffers.emplace_back(std::move(dst_buffer)); std::shared_ptr& src_buffer = gpu_tensors[i]; PbMemory::CopyBuffer(dst_buffers[i], src_buffer->Memory()); } - - { - bi::scoped_lock guard{send_message_payload->mu}; - send_message_payload->is_stub_turn = false; - send_message_payload->cv.notify_one(); - while (!send_message_payload->is_stub_turn) { - // Wait for the stub process to send the response and populate error - // message if any. 
- send_message_payload->cv.wait(guard); - } - } } if (send_message_payload->has_error) { @@ -180,4 +252,38 @@ ResponseSender::Send( } } } + +bool +ResponseSender::IsCancelled() +{ + return pb_cancel_->IsCancelled(); +} + +bool +ResponseSender::IsClosed() +{ + std::lock_guard lk(mu_); + return closed_; +} + +void +ResponseSender::Close() +{ + std::lock_guard lk(mu_); + closed_ = true; + response_factory_deleted_.exchange(true); +} + +void +ResponseSender::DeleteResponseFactory() +{ + bool already_deleted = response_factory_deleted_.exchange(true); + if (!already_deleted) { + std::unique_ptr& stub = Stub::GetOrCreateInstance(); + stub->EnqueueCleanupId( + reinterpret_cast(response_factory_address_), + PYTHONSTUB_DecoupledResponseFactoryCleanup); + } +} + }}} // namespace triton::backend::python diff --git a/src/response_sender.h b/src/response_sender.h index 114f22c0..a696f9eb 100644 --- a/src/response_sender.h +++ b/src/response_sender.h @@ -1,4 +1,4 @@ -// Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// Copyright 2022-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions @@ -26,7 +26,11 @@ #pragma once +#include +#include + #include "infer_response.h" +#include "pb_cancel.h" #include "shm_manager.h" namespace triton { namespace backend { namespace python { @@ -35,13 +39,34 @@ class ResponseSender { public: ResponseSender( intptr_t request_address, intptr_t response_factory_address, - std::unique_ptr& shm_pool); + bool const* is_decoupled, + const std::set& requested_output_names, + std::unique_ptr& shm_pool, + const std::shared_ptr& pb_cancel); + intptr_t ResponseFactory() { return response_factory_address_; } + ~ResponseSender(); void Send(std::shared_ptr response, const uint32_t flags); + bool IsCancelled(); + void UpdateStateAndCounters(InferResponse* response, const uint32_t flags); + + // Can be useful at stopping the model from sending any more responses. + void Close(); + bool IsClosed(); private: + void DeleteResponseFactory(); + intptr_t request_address_; intptr_t response_factory_address_; + bool const* is_decoupled_; + std::set requested_output_names_; std::unique_ptr& shm_pool_; + std::shared_ptr pb_cancel_; + + std::mutex mu_; bool closed_; + size_t number_of_response_sent_; + + std::atomic response_factory_deleted_; }; }}} // namespace triton::backend::python diff --git a/src/shm_manager.cc b/src/shm_manager.cc index 555bd023..134cee6f 100644 --- a/src/shm_manager.cc +++ b/src/shm_manager.cc @@ -1,4 +1,4 @@ -// Copyright 2021-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// Copyright 2021-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions @@ -24,15 +24,62 @@ // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+#include "shm_manager.h" + #include #include #include #include -#include "shm_manager.h" - namespace triton { namespace backend { namespace python { +void +CUDAMemoryPoolManager::SetCUDAPoolAddress( + const int32_t device_id, void* cuda_pool_address) +{ + std::lock_guard lock(mu_); + cuda_pool_address_map_[device_id] = cuda_pool_address; +} + +void* +CUDAMemoryPoolManager::CUDAPoolAddress(const int32_t device_id) +{ + if (cuda_pool_address_map_.find(device_id) != cuda_pool_address_map_.end()) { + return cuda_pool_address_map_[device_id]; + } else { + throw PythonBackendException( + "CUDA pool address for device " + std::to_string(device_id) + + " is not set."); + } +} + +void +CUDAMemoryPoolManager::SetTritonMemoryManager(void* triton_memory_manager) +{ + triton_memory_manager_ = triton_memory_manager; +} + +void* +CUDAMemoryPoolManager::TritonMemoryManager() +{ + return triton_memory_manager_; +} + +bool +CUDAMemoryPoolManager::UseCudaSharedPool(const int32_t device_id) +{ + return (cuda_pool_address_map_.find(device_id) != + cuda_pool_address_map_.end()) && + (cuda_pool_address_map_[device_id] != nullptr) && + (triton_memory_manager_ != nullptr); +} + +std::unordered_map& +CUDAMemoryPoolManager::CUDAPoolAddressMap() +{ + return cuda_pool_address_map_; +} + SharedMemoryManager::SharedMemoryManager( const std::string& shm_region_name, size_t shm_size, size_t shm_growth_bytes, bool create) @@ -40,6 +87,7 @@ SharedMemoryManager::SharedMemoryManager( shm_region_name_ = shm_region_name; create_ = create; shm_growth_bytes_ = shm_growth_bytes; + cuda_memory_pool_manager_ = std::make_unique(); try { if (create) { @@ -76,7 +124,7 @@ SharedMemoryManager::SharedMemoryManager( "' to requested size (" + std::to_string(shm_size) + " bytes). If you are running Triton inside docker, use '--shm-size' " "flag to control the shared memory region size. Each Python backend " - "model instance requires at least 64MBs of shared memory. Error: " + + "model instance requires at least 1 MB of shared memory. Error: " + ex.what()); // Remove the shared memory region if there was an error. bi::shared_memory_object::remove(shm_region_name.c_str()); @@ -99,6 +147,7 @@ SharedMemoryManager::SharedMemoryManager(const std::string& shm_region_name) shm_region_name_ = shm_region_name; create_ = false; shm_growth_bytes_ = 1024; + cuda_memory_pool_manager_ = std::make_unique(); shm_obj_ = std::make_unique( bi::open_only, shm_region_name.c_str(), bi::read_write); @@ -139,8 +188,8 @@ SharedMemoryManager::GrowIfNeeded(uint64_t byte_size) } catch (bi::interprocess_exception& ex) { std::string error_message = - ("Failed to increase the shared memory pool size for key '" + - shm_region_name_ + "' to " + std::to_string(*total_size_) + + ("Failed to increase the shared memory pool size to " + + std::to_string(*total_size_) + " bytes. If you are running Triton inside docker, use '--shm-size' " "flag to control the shared memory region size. Error: " + ex.what()); diff --git a/src/shm_manager.h b/src/shm_manager.h index 108a3a44..e0799a07 100644 --- a/src/shm_manager.h +++ b/src/shm_manager.h @@ -1,4 +1,4 @@ -// Copyright 2021-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// Copyright 2021-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
// // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions @@ -26,21 +26,49 @@ #pragma once -#include #include #include #include +#include #include #include +#include #include #include +#include #include -#include + #include "pb_exception.h" namespace triton { namespace backend { namespace python { namespace bi = boost::interprocess; +class CUDAMemoryPoolManager { + public: + CUDAMemoryPoolManager() : triton_memory_manager_(nullptr) {} + + void SetCUDAPoolAddress(const int32_t device_id, void* cuda_pool_address); + + void* CUDAPoolAddress(const int32_t device_id); + + void SetTritonMemoryManager(void* triton_memory_manager); + + void* TritonMemoryManager(); + + bool UseCudaSharedPool(const int32_t device_id); + + // Return cuda pool address map + std::unordered_map& CUDAPoolAddressMap(); + + private: + // The base address of the Triton CUDA memory pool + std::unordered_map cuda_pool_address_map_; + // The mutex to protect the cuda_pool_address_map_ + std::mutex mu_; + // TRITONBACKEND_MemoryManager + void* triton_memory_manager_; +}; + template struct AllocatedSharedMemory { AllocatedSharedMemory() = default; @@ -62,9 +90,9 @@ struct AllocatedSharedMemory { // info is placed in the beginning and the actual object is placed after that // (i.e. 4 plus the aligned address is not 16-bytes aligned). The aligned memory // is required by semaphore otherwise it may lead to SIGBUS error on ARM. -struct AllocatedShmOwnership { +struct alignas(16) AllocatedShmOwnership { uint32_t ref_count_; -} __attribute__((aligned(16))); +}; class SharedMemoryManager { public: @@ -82,7 +110,7 @@ class SharedMemoryManager { bi::managed_external_buffer::handle_t handle = 0; { - bi::scoped_lock gaurd{*shm_mutex_}; + bi::scoped_lock guard{*shm_mutex_}; std::size_t requested_bytes = sizeof(T) * count + sizeof(AllocatedShmOwnership); GrowIfNeeded(0); @@ -119,7 +147,7 @@ class SharedMemoryManager { AllocatedShmOwnership* shm_ownership_data; { - bi::scoped_lock gaurd{*shm_mutex_}; + bi::scoped_lock guard{*shm_mutex_}; GrowIfNeeded(0); shm_ownership_data = reinterpret_cast( managed_buffer_->get_address_from_handle(handle)); @@ -138,7 +166,7 @@ class SharedMemoryManager { void Deallocate(bi::managed_external_buffer::handle_t handle) { - bi::scoped_lock gaurd{*shm_mutex_}; + bi::scoped_lock guard{*shm_mutex_}; GrowIfNeeded(0); void* ptr = managed_buffer_->get_address_from_handle(handle); managed_buffer_->deallocate(ptr); @@ -155,6 +183,14 @@ class SharedMemoryManager { void SetDeleteRegion(bool delete_region); + std::unique_ptr& GetCUDAMemoryPoolManager() + { + return cuda_memory_pool_manager_; + } + + uint64_t GetCurrentCapacity() { return current_capacity_; } + void* GetBaseAddress() { return managed_buffer_->get_address(); } + ~SharedMemoryManager() noexcept(false); private: @@ -169,6 +205,7 @@ class SharedMemoryManager { uint64_t* total_size_; bool create_; bool delete_region_; + std::unique_ptr cuda_memory_pool_manager_; template AllocatedSharedMemory WrapObjectInUniquePtr( @@ -179,7 +216,7 @@ class SharedMemoryManager { std::function deleter = [this, handle, shm_ownership_data](T* memory) { bool destroy = false; - bi::scoped_lock gaurd{*shm_mutex_}; + bi::scoped_lock guard{*shm_mutex_}; // Before using any shared memory function you need to make sure that you // are using the correct mapping. 
For example, shared memory growth may // happen between the time an object was created and the time the object diff --git a/src/shm_monitor/CMakeLists.txt b/src/shm_monitor/CMakeLists.txt index 0f7d4b86..2ae8bd45 100644 --- a/src/shm_monitor/CMakeLists.txt +++ b/src/shm_monitor/CMakeLists.txt @@ -1,4 +1,4 @@ -# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# Copyright (c) 2022-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions @@ -24,7 +24,7 @@ # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -cmake_minimum_required (VERSION 3.18) +cmake_minimum_required (VERSION 3.31.8) pybind11_add_module( triton-shm-monitor diff --git a/src/shm_monitor/shm_monitor.cc b/src/shm_monitor/shm_monitor.cc index dfeb1fbb..e0c08d3c 100644 --- a/src/shm_monitor/shm_monitor.cc +++ b/src/shm_monitor/shm_monitor.cc @@ -25,6 +25,7 @@ // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include + #include "../shm_manager.h" namespace triton { namespace backend { namespace python { diff --git a/src/stub_launcher.cc b/src/stub_launcher.cc index 545c528f..32f5d1bd 100644 --- a/src/stub_launcher.cc +++ b/src/stub_launcher.cc @@ -25,23 +25,31 @@ // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "stub_launcher.h" + +#include + +#include "pb_utils.h" #include "python_be.h" +#ifdef _WIN32 +#include // getpid() +#endif + +extern char** environ; + namespace triton { namespace backend { namespace python { StubLauncher::StubLauncher(const std::string stub_process_kind) - : parent_pid_(0), stub_pid_(0), is_initialized_(false), + : parent_pid_(0), is_initialized_(false), stub_process_kind_(stub_process_kind), model_instance_name_(""), device_id_(0), kind_("") - { } StubLauncher::StubLauncher( const std::string stub_process_kind, const std::string model_instance_name, const int32_t device_id, const std::string kind) - : parent_pid_(0), stub_pid_(0), is_initialized_(false), - stub_process_kind_(stub_process_kind), + : is_initialized_(false), stub_process_kind_(stub_process_kind), model_instance_name_(model_instance_name), device_id_(device_id), kind_(kind) { @@ -61,22 +69,30 @@ StubLauncher::Initialize(ModelState* model_state) model_state->ModelConfig().Write(&model_config_buffer_); is_decoupled_ = model_state->IsDecoupled(); model_repository_path_ = model_state->RepositoryPath(); + runtime_modeldir_ = model_state->RuntimeModelDir(); + if (runtime_modeldir_.empty()) { + runtime_modeldir_ = "DEFAULT"; + } +#ifdef _WIN32 + ZeroMemory(&startup_info_, sizeof(startup_info_)); + startup_info_.cb = sizeof(startup_info_); + ZeroMemory(&stub_pid_, sizeof(stub_pid_)); +#else + stub_pid_ = 0; +#endif - // Atomically increase and read the stub process count to avoid shared memory - // region name collision - int num_init = ++model_state->StateForBackend()->number_of_instance_inits; shm_region_name_ = model_state->StateForBackend()->shared_memory_region_prefix + - std::to_string(num_init); + GenerateUUID(); model_version_ = model_state->Version(); std::stringstream ss; + const char os_slash = std::filesystem::path::preferred_separator; + ss << model_repository_path_ << os_slash << model_version_ << os_slash; std::string artifact_name; RETURN_IF_ERROR(model_state->ModelConfig().MemberAsString( "default_model_filename", 
&artifact_name)); - ss << model_repository_path_ << "/" << model_version_ << "/"; - if (artifact_name.size() > 0) { ss << artifact_name; } else { @@ -85,40 +101,20 @@ StubLauncher::Initialize(ModelState* model_state) } model_path_ = ss.str(); - struct stat buffer; - // Check if model.py exists - if (stat(model_path_.c_str(), &buffer) != 0) { + // FIXME [DLIS-5969]: Enable for Windows when custom execution environments + // are supported. + if (python_execution_env_ != "") { +#ifndef _WIN32 + RETURN_IF_ERROR(GetPythonEnvironment(model_state)); +#else return TRITONSERVER_ErrorNew( - TRITONSERVER_ERROR_INTERNAL, - ("model.py does not exist in the model repository path: " + model_path_) - .c_str()); + TRITONSERVER_ERROR_UNSUPPORTED, + "Custom execution environments are not currently supported on " + "Windows."); +#endif } - // Path to the extracted Python env - std::string python_execution_env = ""; - if (python_execution_env_ != "") { - try { - python_execution_env = - model_state->StateForBackend()->env_manager->ExtractIfNotExtracted( - python_execution_env_); - } - catch (PythonBackendException& pb_exception) { - return TRITONSERVER_ErrorNew( - TRITONSERVER_ERROR_INTERNAL, pb_exception.what()); - } - - path_to_activate_ = python_execution_env + "/bin/activate"; - path_to_libpython_ = python_execution_env + "/lib"; - if (python_execution_env.length() > 0 && !FileExists(path_to_activate_)) { - return TRITONSERVER_ErrorNew( - TRITONSERVER_ERROR_INTERNAL, - ("Path " + path_to_activate_ + - " does not exist. The Python environment should contain an " - "'activate' script.") - .c_str()); - } - } parent_pid_ = getpid(); @@ -201,6 +197,11 @@ StubLauncher::Setup() return nullptr; } +// FIXME: This should be merged with the Unix launch function once Windows +// CI and functionality are demonstrably stable. The goal of keeping the +// functions separate is to help debug Windows-specific issues without worrying +// about the impact to our Unix builds. +#ifdef _WIN32 TRITONSERVER_Error* StubLauncher::Launch() { @@ -211,62 +212,173 @@ StubLauncher::Launch() stub_name = model_instance_name_; } - const char* stub_args[4]; - stub_args[0] = "bash"; - stub_args[1] = "-c"; - stub_args[3] = nullptr; // Last argument must be nullptr + const char os_slash = std::filesystem::path::preferred_separator; + + const std::string stub_executable_name = "triton_python_backend_stub.exe"; + SanitizePath(model_path_); + SanitizePath(model_repository_path_); // Default Python backend stub - std::string python_backend_stub = python_lib_ + "/triton_python_backend_stub"; + std::string python_backend_stub = + python_lib_ + os_slash + stub_executable_name; + + LOG_MESSAGE( + TRITONSERVER_LOG_INFO, + (std::string("Stub path ") + python_backend_stub).c_str()); // Path to alternative Python backend stub std::string model_python_backend_stub = - std::string(model_repository_path_) + "/triton_python_backend_stub"; + std::string(model_repository_path_) + os_slash + stub_executable_name; + LOG_MESSAGE( + TRITONSERVER_LOG_INFO, + (std::string("Alt path ") + python_backend_stub).c_str()); + + // Check if file exists + // TODO: Integrate win32 and pb_env if (FileExists(model_python_backend_stub)) { python_backend_stub = model_python_backend_stub; } - std::string bash_argument; + std::string launch_command; - // This shared memory variable indicates whether the stub process should - // revert the LD_LIBRARY_PATH changes to avoid shared library issues in - // executables and libraries. 
- ipc_control_->uses_env = false; - if (python_execution_env_ != "") { - std::stringstream ss; + std::stringstream ss; + ss << python_backend_stub << " " << model_path_ << " " << shm_region_name_ + << " " << shm_default_byte_size_ << " " << shm_growth_byte_size_ << " " + << parent_pid_ << " " << python_lib_ << " " << ipc_control_handle_ << " " + << stub_name << " " << runtime_modeldir_; + launch_command = ss.str(); - // Need to properly set the LD_LIBRARY_PATH so that Python environments - // using different python versions load properly. - ss << "source " << path_to_activate_ - << " && exec env LD_LIBRARY_PATH=" << path_to_libpython_ - << ":$LD_LIBRARY_PATH " << python_backend_stub << " " << model_path_ - << " " << shm_region_name_ << " " << shm_default_byte_size_ << " " - << shm_growth_byte_size_ << " " << parent_pid_ << " " << python_lib_ - << " " << ipc_control_handle_ << " " << stub_name; - ipc_control_->uses_env = true; - bash_argument = ss.str(); - } else { + LOG_MESSAGE( + TRITONSERVER_LOG_INFO, + (std::string("Starting Python backend stub: ") + launch_command).c_str()); + + LPSTR launch_command_lpstr = const_cast(launch_command.c_str()); + // Start the child process. Unlike fork(), the remainder of this + // function exists in the context of the parent, only. + if (!CreateProcess( + NULL, // No module name (use command line) + launch_command_lpstr, // Command line + NULL, // Process handle not inheritable + NULL, // Thread handle not inheritable + FALSE, // Set handle inheritance to FALSE + 0, // No creation flags + NULL, // Use parent's environment block + NULL, // Use parent's starting directory + &startup_info_, // Pointer to STARTUPINFO structure + &stub_pid_) // Pointer to PROCESS_INFORMATION structure + ) { std::stringstream ss; - ss << " exec " << python_backend_stub << " " << model_path_ << " " - << shm_region_name_ << " " << shm_default_byte_size_ << " " - << shm_growth_byte_size_ << " " << parent_pid_ << " " << python_lib_ - << " " << ipc_control_handle_ << " " << stub_name; - bash_argument = ss.str(); + ss << "Failed to run python backend stub. Errno = " << errno << '\n' + << "Python backend stub path: " << python_backend_stub << '\n' + << "Shared Memory Region Name: " << shm_region_name_ << '\n' + << "Shared Memory Default Byte Size: " << shm_default_byte_size_ << '\n' + << "Shared Memory Growth Byte Size: " << shm_growth_byte_size_ << '\n'; + // Print the error message directly because the underlying mutexes in + // LOG_MESSAGE() could be forked when it is locked by other thread(s). + std::cerr << '\n' << ss.str() << '\n'; + _Exit(1); } + ScopedDefer _([&] { + // Push a dummy message to the message queue so that the stub + // process is notified that it can release the object stored in + // shared memory. + if (stub_message_queue_) { + stub_message_queue_->Push(DUMMY_MESSAGE); + } + + // If the model is not initialized, wait for the stub process to exit. + if (!is_initialized_) { + stub_message_queue_.reset(); + parent_message_queue_.reset(); + memory_manager_.reset(); + WaitForStubProcess(); + } + }); + + // The stub process would send two messages to the parent process during the + // initialization. + // 1. When the stub process's health monitoring thread has started. + // 2. When the initialization is fully completed and the Python model is + // loaded. 
+ // + // The reason it is broken into two steps is that creation of the health + // monitoring thread may take longer which can make the server process think + // that the stub process is unhealthy and return early. Waiting with a longer + // timeout prevents this issue. + const uint64_t initialization_timeout_ms = 10000; // 10 sec LOG_MESSAGE( TRITONSERVER_LOG_VERBOSE, - (std::string("Starting Python backend stub: ") + bash_argument).c_str()); + "Waiting for the stub health monitoring thread to start"); - stub_args[2] = bash_argument.c_str(); + bi::managed_external_buffer::handle_t message; + auto err = ReceiveMessageFromStub(message, initialization_timeout_ms); + if (err != nullptr) { + KillStubProcess(); + } + + if (stub_process_kind_ == "AUTOCOMPLETE_STUB") { + if (err != nullptr) { + throw BackendModelException(err); + } + try { + AutocompleteStubProcess(); + } + catch (const PythonBackendException& ex) { + // Need to kill the stub process first + KillStubProcess(); + throw BackendModelException( + TRITONSERVER_ErrorNew(TRITONSERVER_ERROR_INTERNAL, ex.what())); + } + } else if (stub_process_kind_ == "MODEL_INSTANCE_STUB") { + RETURN_IF_ERROR(err); + RETURN_IF_ERROR(ModelInstanceStubProcess()); + } else { + return TRITONSERVER_ErrorNew( + TRITONSERVER_ERROR_INTERNAL, + (std::string("Unknown stub_process_kind: ") + stub_process_kind_) + .c_str()); + } + + is_initialized_ = true; - int stub_status_code = - system((python_backend_stub + "> /dev/null 2>&1").c_str()); + return nullptr; +} +#else +TRITONSERVER_Error* +StubLauncher::Launch() +{ + std::string stub_name; + if (stub_process_kind_ == "AUTOCOMPLETE_STUB") { + stub_name = model_name_; + } else { + stub_name = model_instance_name_; + } + + if (!IsValidIdentifier(stub_name)) { + return TRITONSERVER_ErrorNew( + TRITONSERVER_ERROR_INVALID_ARG, + "Invalid stub name: contains invalid characters"); + } - // If running stub process without any arguments returns any status code, - // other than 1, it can indicate a permission issue as a result of - // downloading the stub process from a cloud object storage service. - if (WEXITSTATUS(stub_status_code) != 1) { + if (!IsValidIdentifier(shm_region_name_)) { + return TRITONSERVER_ErrorNew( + TRITONSERVER_ERROR_INVALID_ARG, + "Invalid shared memory region name: contains invalid characters"); + } + + // Default Python backend stub + std::string python_backend_stub = python_lib_ + "/triton_python_backend_stub"; + + // Path to alternative Python backend stub + std::string model_python_backend_stub = + std::string(model_repository_path_) + "/triton_python_backend_stub"; + + if (FileExists(model_python_backend_stub)) { + python_backend_stub = model_python_backend_stub; + } + + if (!IsExecutableFile(python_backend_stub)) { // Give the execute permission for the triton_python_backend_stub to the // owner. int error = chmod(python_backend_stub.c_str(), S_IXUSR); @@ -281,74 +393,227 @@ StubLauncher::Launch() } } - pid_t pid = fork(); - if (pid < 0) { - return TRITONSERVER_ErrorNew( - TRITONSERVER_ERROR_INTERNAL, - "Failed to fork the stub process for auto-complete."); - } - if (pid == 0) { - // Replace this child process with the new stub process. - execvp("bash", (char**)stub_args); - // execvp() never return if succeeded. Otherwise, an error has occured. - std::stringstream ss; - ss << "Failed to run python backend stub. 
Errno = " << errno << '\n' - << "Python backend stub path: " << python_backend_stub << '\n' - << "Shared Memory Region Name: " << shm_region_name_ << '\n' - << "Shared Memory Default Byte Size: " << shm_default_byte_size_ << '\n' - << "Shared Memory Growth Byte Size: " << shm_growth_byte_size_ << '\n'; - // Print the error message directly because the underlying mutexes in - // LOG_MESSAGE() could be forked when it is locked by other thread(s). - std::cerr << '\n' << ss.str() << '\n'; - // Terminate the child execution immediately to avoid any issues. - _Exit(1); - } else { - ScopedDefer _([&] { - // Push a dummy message to the message queue so that the stub - // process is notified that it can release the object stored in - // shared memory. - stub_message_queue_->Push(DUMMY_MESSAGE); + // Prepare arguments for execution + std::vector arg_strings; + std::vector exec_args; - // If the model is not initialized, wait for the stub process to exit. - if (!is_initialized_) { - int status; - stub_message_queue_.reset(); - parent_message_queue_.reset(); - memory_manager_.reset(); - waitpid(stub_pid_, &status, 0); - } - }); + // This shared memory variable indicates whether the stub process should + // revert the LD_LIBRARY_PATH changes to avoid shared library issues in + // executables and libraries. + ipc_control_->uses_env = false; - stub_pid_ = pid; + if (python_execution_env_ != "") { + ipc_control_->uses_env = true; - if (stub_process_kind_ == "AUTOCOMPLETE_STUB") { - try { - AutocompleteStubProcess(); - } - catch (const PythonBackendException& ex) { - // Need to kill the stub process first - kill(stub_pid_, SIGKILL); - int status; - waitpid(stub_pid_, &status, 0); - stub_pid_ = 0; - throw BackendModelException( - TRITONSERVER_ErrorNew(TRITONSERVER_ERROR_INTERNAL, ex.what())); - } - } else if (stub_process_kind_ == "MODEL_INSTANCE_STUB") { - RETURN_IF_ERROR(ModelInstanceStubProcess()); + // Parse environment variables from activation script + std::map env_vars = + ParseActivationScript(path_to_activate_); + + // Prepare environment with additional library path + auto [env_strings, custom_env] = + PrepareEnvironment(env_vars, path_to_libpython_); + + // Set up arguments for direct execution + arg_strings.push_back(python_backend_stub); + arg_strings.push_back(model_path_); + arg_strings.push_back(shm_region_name_); + arg_strings.push_back(std::to_string(shm_default_byte_size_)); + arg_strings.push_back(std::to_string(shm_growth_byte_size_)); + arg_strings.push_back(std::to_string(parent_pid_)); + arg_strings.push_back(python_lib_); + arg_strings.push_back(std::to_string(ipc_control_handle_)); + arg_strings.push_back(stub_name); + arg_strings.push_back(runtime_modeldir_); + + // Convert strings to char* array for exec + for (const auto& arg : arg_strings) { + exec_args.push_back(arg.c_str()); + } + exec_args.push_back(nullptr); // exec requires null termination + + // Log the command being executed + std::ostringstream log_cmd; + for (size_t i = 0; i < arg_strings.size(); ++i) { + if (i > 0) + log_cmd << " "; + log_cmd << "'" << arg_strings[i] << "'"; + } + LOG_MESSAGE( + TRITONSERVER_LOG_VERBOSE, + (std::string("Starting Python backend stub with custom environment: ") + + log_cmd.str()) + .c_str()); + + pid_t pid = fork(); + if (pid < 0) { + return TRITONSERVER_ErrorNew( + TRITONSERVER_ERROR_INTERNAL, + "Failed to fork the stub process for auto-complete."); + } + if (pid == 0) { + // Replace this child process with the new stub process using custom + // environment + execve( + 
python_backend_stub.c_str(), const_cast(exec_args.data()), + custom_env.data()); + // execve() never returns if succeeded. Otherwise, an error has occurred. + std::stringstream ss; + ss << "Failed to run python backend stub with custom environment. Errno " + "= " + << errno << '\n' + << "Python backend stub path: " << python_backend_stub << '\n' + << "Activation script: " << path_to_activate_ << '\n' + << "Library path: " << path_to_libpython_ << '\n'; + std::cerr << '\n' << ss.str() << '\n'; + _Exit(1); } else { + stub_pid_ = pid; + } + + } else { + arg_strings.push_back(python_backend_stub); + arg_strings.push_back(model_path_); + arg_strings.push_back(shm_region_name_); + arg_strings.push_back(std::to_string(shm_default_byte_size_)); + arg_strings.push_back(std::to_string(shm_growth_byte_size_)); + arg_strings.push_back(std::to_string(parent_pid_)); + arg_strings.push_back(python_lib_); + arg_strings.push_back(std::to_string(ipc_control_handle_)); + arg_strings.push_back(stub_name); + arg_strings.push_back(runtime_modeldir_); + + // Convert strings to char* array for exec + for (const auto& arg : arg_strings) { + exec_args.push_back(arg.c_str()); + } + exec_args.push_back(nullptr); // exec requires null termination + + // Log the command being executed + std::ostringstream log_cmd; + for (size_t i = 0; i < arg_strings.size(); ++i) { + if (i > 0) + log_cmd << " "; + log_cmd << "'" << arg_strings[i] << "'"; + } + LOG_MESSAGE( + TRITONSERVER_LOG_VERBOSE, + (std::string("Starting Python backend stub: ") + log_cmd.str()) + .c_str()); + + pid_t pid = fork(); + if (pid < 0) { return TRITONSERVER_ErrorNew( TRITONSERVER_ERROR_INTERNAL, - (std::string("Unknown stub_process_kind: ") + stub_process_kind_) - .c_str()); + "Failed to fork the stub process for auto-complete."); + } + if (pid == 0) { + // Replace this child process with the new stub process. + execv(python_backend_stub.c_str(), const_cast(exec_args.data())); + // execv() never returns if succeeded. Otherwise, an error has occurred. + std::stringstream ss; + ss << "Failed to run python backend stub. Errno = " << errno << '\n' + << "Python backend stub path: " << python_backend_stub << '\n'; + std::cerr << '\n' << ss.str() << '\n'; + _Exit(1); + } else { + stub_pid_ = pid; } + } + + ScopedDefer _([&] { + // Push a dummy message to the message queue so that the stub + // process is notified that it can release the object stored in + // shared memory. + if (stub_message_queue_) { + stub_message_queue_->Push(DUMMY_MESSAGE); + } + + // If the model is not initialized, wait for the stub process to exit. + if (!is_initialized_) { + stub_message_queue_.reset(); + parent_message_queue_.reset(); + memory_manager_.reset(); + WaitForStubProcess(); + } + }); + + // The stub process would send two messages to the parent process during the + // initialization. + // 1. When the stub process's health monitoring thread has started. + // 2. When the initialization is fully completed and the Python model is + // loaded. + // + // The reason it is broken into two steps is that creation of the health + // monitoring thread may take longer which can make the server process think + // that the stub process is unhealthy and return early. Waiting with a + // longer timeout prevents this issue. 
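The comment above describes a two-phase startup handshake: the stub first reports that its health-monitoring thread is running, and only later that the Python model finished loading, with a generous timeout on the first message so a slow thread start is not mistaken for an unhealthy stub. As a loose analogy only (the real mechanism is a shared-memory message queue between processes, not multiprocessing.Queue), the timing logic looks roughly like this:

```python
import multiprocessing as mp
import queue


def stub(q):
    q.put("health_thread_started")  # phase 1: monitoring thread is up
    # ... potentially slow model loading happens here ...
    q.put("model_initialized")      # phase 2: initialization finished


if __name__ == "__main__":
    q = mp.Queue()
    p = mp.Process(target=stub, args=(q,))
    p.start()
    try:
        q.get(timeout=10.0)  # long timeout for the first message
        q.get(timeout=5.0)   # subsequent waits can be shorter
    except queue.Empty:
        p.terminate()
    finally:
        p.join()
```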
+ const uint64_t initialization_timeout_ms = 10000; // 10 sec + LOG_MESSAGE( + TRITONSERVER_LOG_VERBOSE, + "Waiting for the stub health monitoring thread to start"); + + bi::managed_external_buffer::handle_t message; + auto err = ReceiveMessageFromStub(message, initialization_timeout_ms); + if (err != nullptr) { + KillStubProcess(); + } - is_initialized_ = true; + if (stub_process_kind_ == "AUTOCOMPLETE_STUB") { + if (err != nullptr) { + throw BackendModelException(err); + } + try { + AutocompleteStubProcess(); + } + catch (const PythonBackendException& ex) { + // Need to kill the stub process first + KillStubProcess(); + throw BackendModelException( + TRITONSERVER_ErrorNew(TRITONSERVER_ERROR_INTERNAL, ex.what())); + } + } else if (stub_process_kind_ == "MODEL_INSTANCE_STUB") { + RETURN_IF_ERROR(err); + RETURN_IF_ERROR(ModelInstanceStubProcess()); + } else { + return TRITONSERVER_ErrorNew( + TRITONSERVER_ERROR_INTERNAL, + (std::string("Unknown stub_process_kind: ") + stub_process_kind_) + .c_str()); } + is_initialized_ = true; + return nullptr; } +TRITONSERVER_Error* +StubLauncher::GetPythonEnvironment(ModelState* model_state) +{ + std::string python_execution_env = ""; + try { + python_execution_env = + model_state->StateForBackend()->env_manager->ExtractIfNotExtracted( + python_execution_env_); + } + catch (PythonBackendException& pb_exception) { + return TRITONSERVER_ErrorNew( + TRITONSERVER_ERROR_INTERNAL, pb_exception.what()); + } + + path_to_activate_ = python_execution_env + "/bin/activate"; + path_to_libpython_ = python_execution_env + "/lib"; + if (python_execution_env.length() > 0 && !FileExists(path_to_activate_)) { + return TRITONSERVER_ErrorNew( + TRITONSERVER_ERROR_INTERNAL, + ("Path " + path_to_activate_ + + " does not exist. The Python environment should contain an " + "'activate' script.") + .c_str()); + } + return nullptr; +} +#endif + void StubLauncher::AutocompleteStubProcess() { @@ -428,8 +693,13 @@ StubLauncher::ModelInstanceStubProcess() initialize_message->Args() = initialize_map_handle; stub_message_queue_->Push(initialize_message->ShmHandle()); + const uint64_t initialization_timeout_ms = 5000; // 5 sec + LOG_MESSAGE( + TRITONSERVER_LOG_VERBOSE, + "Waiting for the stub process initialization response"); + bi::managed_external_buffer::handle_t message; - RETURN_IF_ERROR(ReceiveMessageFromStub(message)); + RETURN_IF_ERROR(ReceiveMessageFromStub(message, initialization_timeout_ms)); std::unique_ptr initialize_response_message = IPCMessage::LoadFromSharedMemory(shm_pool_, message); @@ -465,6 +735,18 @@ StubLauncher::ModelInstanceStubProcess() return nullptr; } +bool +StubLauncher::StubActive() +{ +#ifdef _WIN32 + DWORD ec; + GetExitCodeProcess(stub_pid_.hProcess, &ec); + return (ec == STILL_ACTIVE); +#else + return (stub_pid_ != 0); +#endif +} + void StubLauncher::UpdateHealth() { @@ -475,9 +757,13 @@ StubLauncher::UpdateHealth() ipc_control_->stub_health = false; } - // Sleep 1 second so that the child process has a chance to change the - // health variable +// Sleep 1 second so that the child process has a chance to change the +// health variable +#ifdef _WIN32 + Sleep(1); +#else sleep(1); +#endif { bi::scoped_lock lock(*health_mutex_); @@ -507,11 +793,11 @@ StubLauncher::TerminateStub() force_kill = true; } - int status; if (force_kill) { - kill(stub_pid_, SIGKILL); + KillStubProcess(); + } else { + WaitForStubProcess(); } - waitpid(stub_pid_, &status, 0); } // First destroy the IPCControl. 
This makes sure that IPCControl is @@ -532,19 +818,25 @@ StubLauncher::ClearQueues() void StubLauncher::KillStubProcess() { +#ifdef _WIN32 + unsigned int exit_code; + TerminateProcess(stub_pid_.hProcess, exit_code); + CloseHandle(stub_pid_.hProcess); + CloseHandle(stub_pid_.hThread); +#else kill(stub_pid_, SIGKILL); - int status; - waitpid(stub_pid_, &status, 0); + WaitForStubProcess(); stub_pid_ = 0; +#endif } TRITONSERVER_Error* StubLauncher::ReceiveMessageFromStub( - bi::managed_external_buffer::handle_t& message) + bi::managed_external_buffer::handle_t& message, + uint64_t timeout_miliseconds) { bool success = false; while (!success) { - uint64_t timeout_miliseconds = 1000; { boost::posix_time::ptime timeout = boost::get_system_time() + @@ -591,4 +883,124 @@ StubLauncher::ReceiveMessageFromStub( return nullptr; // success } -}}}; // namespace triton::backend::python + +void +StubLauncher::WaitForStubProcess() +{ +#ifdef _WIN32 + WaitForSingleObject(stub_pid_.hProcess, INFINITE); + CloseHandle(stub_pid_.hProcess); + CloseHandle(stub_pid_.hThread); +#else + int status; + if (stub_pid_ != 0) { + // Added this check to ensure server doesn't hang waiting after stub + // process has already be killed and cannot be waited on + waitpid(stub_pid_, &status, 0); + } +#endif +} + +#ifdef TRITON_ENABLE_GPU +void +StubLauncher::ShareCUDAMemoryPool( + TRITONBACKEND_MemoryManager* triton_mem_manager, const int32_t device_id) +{ + std::lock_guard lock(cuda_shm_pool_mutex_); + if ((tried_sharing_cuda_pool_map_.find(device_id) != + tried_sharing_cuda_pool_map_.end()) && + tried_sharing_cuda_pool_map_[device_id]) { + return; + } + + std::unique_ptr ipc_message = + IPCMessage::Create(shm_pool_, true /* inline_response */); + CUDAMemPoolMessage* cuda_pool_message_ptr = nullptr; + PythonBackendException pb_exception(std::string{}); + + try { + // Create a dummy BackendMemory object to get the start address of the CUDA + // memory pool. + BackendMemory* backend_memory; + std::unique_ptr lbackend_memory; + + THROW_IF_TRITON_ERROR(BackendMemory::Create( + triton_mem_manager, BackendMemory::AllocationType::GPU_POOL, device_id, + 1 /* byte size*/, &backend_memory)); + lbackend_memory.reset(backend_memory); + + CUDAHandler& cuda_api = CUDAHandler::getInstance(); + CUdeviceptr cuda_pool_address = 0; + cuda_api.PointerGetAttribute( + &cuda_pool_address, CU_POINTER_ATTRIBUTE_RANGE_START_ADDR, + reinterpret_cast(lbackend_memory->MemoryPtr())); + + shm_pool_->GetCUDAMemoryPoolManager()->SetCUDAPoolAddress( + device_id, reinterpret_cast(cuda_pool_address)); + shm_pool_->GetCUDAMemoryPoolManager()->SetTritonMemoryManager( + reinterpret_cast(triton_mem_manager)); + + // Get the memory handle from the CUDA memory pool. 
+ AllocatedSharedMemory cuda_pool_message = + shm_pool_->Construct(); + cuda_pool_message_ptr = cuda_pool_message.data_.get(); + { + ScopedSetDevice scoped_set_device(device_id); + THROW_IF_CUDA_ERROR(cudaIpcGetMemHandle( + reinterpret_cast( + &cuda_pool_message_ptr->cuda_handle), + reinterpret_cast(shm_pool_->GetCUDAMemoryPoolManager() + ->CUDAPoolAddress(device_id)))); + } + + ipc_message->Command() = PYTHONSTUB_CUDAPoolInitializeRequest; + ipc_message->Args() = cuda_pool_message.handle_; + + cuda_pool_message_ptr->device_id = device_id; + cuda_pool_message_ptr->has_error = false; + cuda_pool_message_ptr->is_error_set = false; + cuda_pool_message_ptr->waiting_on_stub = false; + + { + bi::scoped_lock lock{ + *(ipc_message->ResponseMutex())}; + parent_to_stub_mq_->Push(ipc_message->ShmHandle()); + while (!cuda_pool_message_ptr->waiting_on_stub) { + ipc_message->ResponseCondition()->wait(lock); + } + } + + if (cuda_pool_message_ptr->has_error) { + if (cuda_pool_message_ptr->is_error_set) { + std::unique_ptr error_message = + PbString::LoadFromSharedMemory( + shm_pool_, cuda_pool_message_ptr->error); + throw PythonBackendException(error_message->String()); + } else { + throw PythonBackendException( + "Failed to share CUDA memory pool with stub process: " + + model_name_); + } + } + } + catch (const PythonBackendException& exception) { + shm_pool_->GetCUDAMemoryPoolManager()->SetCUDAPoolAddress( + device_id, nullptr); + pb_exception = exception; + } + + { + bi::scoped_lock lock{ + *(ipc_message->ResponseMutex())}; + cuda_pool_message_ptr->waiting_on_stub = false; + ipc_message->ResponseCondition()->notify_all(); + } + + tried_sharing_cuda_pool_map_[device_id] = true; + + if (pb_exception.what() != std::string{""}) { + throw pb_exception; + } +} +#endif // TRITON_ENABLE_GPU +}}}; // namespace triton::backend::python diff --git a/src/stub_launcher.h b/src/stub_launcher.h index c3da400b..58cdcc61 100644 --- a/src/stub_launcher.h +++ b/src/stub_launcher.h @@ -26,7 +26,6 @@ #pragma once -#include #include #include #include @@ -40,6 +39,7 @@ #include #include #include + #include "ipc_message.h" #include "memory_manager.h" #include "message_queue.h" @@ -77,8 +77,8 @@ class StubLauncher { // Model instance stub process TRITONSERVER_Error* ModelInstanceStubProcess(); - // Stub PID - pid_t StubPid() { return stub_pid_; } + // Check if Stub PID is active + bool StubActive(); // Health mutex bi::interprocess_mutex* HealthMutex() { return health_mutex_; } @@ -147,18 +147,39 @@ class StubLauncher { // Get a message from the stub process TRITONSERVER_Error* ReceiveMessageFromStub( - bi::managed_external_buffer::handle_t& message); + bi::managed_external_buffer::handle_t& message, + uint64_t timeout_miliseconds = 1000); + + // Wait for stub process + void WaitForStubProcess(); + +#ifndef _WIN32 + // FIXME [DLIS-5969]: Enable for Windows when custom execution environments + // are supported. 
+ TRITONSERVER_Error* GetPythonEnvironment(ModelState* model_state); +#endif +#ifdef TRITON_ENABLE_GPU + // Share CUDA memory pool with stub process + void ShareCUDAMemoryPool( + TRITONBACKEND_MemoryManager* triton_mem_manager, const int32_t device_id); +#endif // TRITON_ENABLE_GPU private: +#ifdef _WIN32 + STARTUPINFO startup_info_; + DWORD parent_pid_; + PROCESS_INFORMATION stub_pid_; +#else pid_t parent_pid_; pid_t stub_pid_; - +#endif bool is_initialized_; bool is_decoupled_; bool is_healthy_; std::string shm_region_name_; std::string model_repository_path_; std::string model_path_; + std::string runtime_modeldir_; const std::string stub_process_kind_; std::string model_name_; const std::string model_instance_name_; @@ -193,5 +214,9 @@ class StubLauncher { ipc_control_; bi::managed_external_buffer::handle_t ipc_control_handle_; std::unique_ptr shm_pool_; +#ifdef TRITON_ENABLE_GPU + std::mutex cuda_shm_pool_mutex_; + std::unordered_map tried_sharing_cuda_pool_map_; +#endif // TRITON_ENABLE_GPU }; }}} // namespace triton::backend::python
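Finally, the triton_python_backend_utils.py additions above (the 'optional' input property and the set_model_transaction_policy helper) are meant to be called from a model's auto_complete_config hook. A short sketch of a model.py using them, following the standard auto-complete entry point:

```python
class TritonPythonModel:
    @staticmethod
    def auto_complete_config(auto_complete_model_config):
        auto_complete_model_config.add_input(
            {"name": "INPUT0", "data_type": "TYPE_FP32", "dims": [16]}
        )
        # 'optional' is now an accepted input property.
        auto_complete_model_config.add_input(
            {"name": "MASK", "data_type": "TYPE_BOOL", "dims": [16], "optional": True}
        )
        auto_complete_model_config.add_output(
            {"name": "OUTPUT0", "data_type": "TYPE_FP32", "dims": [16]}
        )
        auto_complete_model_config.set_max_batch_size(8)
        # Mark the model as decoupled via the new transaction-policy helper.
        auto_complete_model_config.set_model_transaction_policy({"decoupled": True})
        return auto_complete_model_config
```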