diff --git a/.clang-format b/.clang-format index 1983a9ca..12bb2f11 100644 --- a/.clang-format +++ b/.clang-format @@ -59,6 +59,7 @@ PenaltyBreakString: 1000 PenaltyExcessCharacter: 1000000 PenaltyReturnTypeOnItsOwnLine: 60 PointerAlignment: Left +QualifierAlignment: Right ReflowComments: true SeparateDefinitionBlocks: Always SortIncludes: CaseSensitive diff --git a/.github/ISSUE_TEMPLATE/bug_report.yml b/.github/ISSUE_TEMPLATE/bug_report.yml new file mode 100644 index 00000000..21518a54 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/bug_report.yml @@ -0,0 +1,117 @@ +name: "Bug Report" +description: Submit a bug report to help us improve TensorRT-LLM backend +labels: [ "bug" ] +body: + - type: textarea + id: system-info + attributes: + label: System Info + description: Please share your system info with us. + placeholder: | + - CPU architecture (e.g., x86_64, aarch64) + - CPU/Host memory size (if known) + - GPU properties + - GPU name (e.g., NVIDIA H100, NVIDIA A100, NVIDIA L40S) + - GPU memory size (if known) + - Clock frequencies used (if applicable) + - Libraries + - TensorRT-LLM branch or tag (e.g., main, v0.7.1) + - TensorRT-LLM commit (if known) + - Versions of TensorRT, AMMO, CUDA, cuBLAS, etc. used + - Container used (if running TensorRT-LLM in a container) + - NVIDIA driver version + - OS (Ubuntu 22.04, CentOS 7, Windows 10) + - Docker image version + - Any other information that may be useful in reproducing the bug + validations: + required: true + + - type: textarea + id: who-can-help + attributes: + label: Who can help? + description: | + To expedite the response to your issue, it would be helpful if you could identify the appropriate person + to tag using the **@** symbol. Here is a general guideline on **whom to tag**. + + Rest assured that all issues are reviewed by the core maintainers. If you are unsure about whom to tag, + you can leave it blank, and a core maintainer will make sure to involve the appropriate person. + + Please tag fewer than 3 people. + + Quantization: @Tracin + + Documentation: @juney-nvidia + + Feature request: @ncomly-nvidia + + Performance: @kaiyux + + Others: @byshiue @schetlur-nv + + placeholder: "@Username ..." + + - type: checkboxes + id: information-scripts-examples + attributes: + label: Information + description: 'The problem arises when using:' + options: + - label: "The official example scripts" + - label: "My own modified scripts" + + - type: checkboxes + id: information-tasks + attributes: + label: Tasks + description: "The tasks I am working on are:" + options: + - label: "An officially supported task in the `examples` folder (such as GLUE/SQuAD, ...)" + - label: "My own task or dataset (give details below)" + + - type: textarea + id: reproduction + validations: + required: true + attributes: + label: Reproduction + description: | + Kindly share a code example that demonstrates the issue you encountered. It is recommending to provide a code snippet directly. + Additionally, if you have any error messages, or stack traces related to the problem, please include them here. + + Remember to use code tags to properly format your code. You can refer to the + link https://help.github.com/en/github/writing-on-github/creating-and-highlighting-code-blocks#syntax-highlighting for guidance on code formatting. + + Please refrain from using screenshots, as they can be difficult to read and prevent others from copying and pasting your code. + It would be most helpful if we could reproduce your issue by simply copying and pasting your scripts and codes. 
+ + placeholder: | + Steps to reproduce the behavior: + + 1. + 2. + 3. + + - type: textarea + id: expected-behavior + validations: + required: true + attributes: + label: Expected behavior + description: "Provide a brief summary of the expected behavior of the software. Provide output files or examples if possible." + + - type: textarea + id: actual-behavior + validations: + required: true + attributes: + label: actual behavior + description: "Describe the actual behavior of the software and how it deviates from the expected behavior. Provide output files or examples if possible." + + - type: textarea + id: additioanl-notes + validations: + required: true + attributes: + label: additional notes + description: "Provide any additional context here you think might be useful for the TensorRT-LLM team to help debug this issue (such as experiments done, potential things to investigate)." diff --git a/.gitignore b/.gitignore index a7116d55..a8cb1c8d 100644 --- a/.gitignore +++ b/.gitignore @@ -7,6 +7,6 @@ build/ *.so *.egg-info/ .coverage -*.csv *.onnx tmp/ +.idea diff --git a/.gitmodules b/.gitmodules index 0e5eaa77..70ad46f3 100644 --- a/.gitmodules +++ b/.gitmodules @@ -1,3 +1,3 @@ [submodule "tensorrt_llm"] path = tensorrt_llm - url = git@github.com:NVIDIA/TensorRT-LLM.git + url = https://github.com/NVIDIA/TensorRT-LLM.git diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 9ee1c078..25ba2b1f 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -7,8 +7,8 @@ repos: rev: v1.1.13 hooks: - id: remove-crlf -- repo: https://github.com/pre-commit/mirrors-yapf - rev: v0.32.0 +- repo: https://github.com/google/yapf + rev: v0.43.0 hooks: - id: yapf - repo: https://github.com/pre-commit/pre-commit-hooks @@ -40,3 +40,11 @@ repos: rev: v0.6.10 hooks: - id: cmake-format +- repo: https://github.com/codespell-project/codespell + rev: v2.2.4 + hooks: + - id: codespell + exclude: tools/dataset/ + args: + - --skip=".git,tensorrt_llm" + - --exclude-file=all_models/whisper/whisper_bls/1/tokenizer.py diff --git a/README.md b/README.md index c3f30f6f..dd98c04d 100644 --- a/README.md +++ b/README.md @@ -1,196 +1,857 @@ + + # TensorRT-LLM Backend The Triton backend for [TensorRT-LLM](https://github.com/NVIDIA/TensorRT-LLM). +You can learn more about Triton backends in the [backend repo](https://github.com/triton-inference-server/backend). +The goal of TensorRT-LLM Backend is to let you serve [TensorRT-LLM](https://github.com/NVIDIA/TensorRT-LLM) +models with Triton Inference Server. The [inflight_batcher_llm](./inflight_batcher_llm/) +directory contains the C++ implementation of the backend supporting inflight +batching, paged attention and more. + +> [!NOTE] +> +> Please note that the Triton backend source code and test have been moved +> to [TensorRT-LLM](https://github.com/NVIDIA/TensorRT-LLM) under the +> `triton_backend` directory. + +Where can I ask general questions about Triton and Triton backends? +Be sure to read all the information below as well as the [general +Triton documentation](https://github.com/triton-inference-server/server#triton-inference-server) +available in the main [server](https://github.com/triton-inference-server/server) +repo. If you don't find your answer there you can ask questions on the +[issues page](https://github.com/triton-inference-server/tensorrtllm_backend/issues). 
+ +## Table of Contents +- [TensorRT-LLM Backend](#tensorrt-llm-backend) + - [Table of Contents](#table-of-contents) + - [Getting Started](#getting-started) + - [Quick Start](#quick-start) + - [Launch Triton TensorRT-LLM container](#launch-triton-tensorrt-llm-container) + - [Prepare TensorRT-LLM engines](#prepare-tensorrt-llm-engines) + - [Prepare the Model Repository](#prepare-the-model-repository) + - [Modify the Model Configuration](#modify-the-model-configuration) + - [Serving with Triton](#serving-with-triton) + - [Send an Inference Request](#send-an-inference-request) + - [Using the generate endpoint](#using-the-generate-endpoint) + - [Using the client scripts](#using-the-client-scripts) + - [Early stopping](#early-stopping) + - [Return context logits and/or generation logits](#return-context-logits-andor-generation-logits) + - [Requests with batch size \> 1](#requests-with-batch-size--1) + - [Building from Source](#building-from-source) + - [Supported Models](#supported-models) + - [Model Config](#model-config) + - [Model Deployment](#model-deployment) + - [TRT-LLM Multi-instance Support](#trt-llm-multi-instance-support) + - [Leader Mode](#leader-mode) + - [Orchestrator Mode](#orchestrator-mode) + - [Running Multiple Instances of LLaMa Model](#running-multiple-instances-of-llama-model) + - [Multi-node Support](#multi-node-support) + - [Model Parallelism](#model-parallelism) + - [Tensor Parallelism, Pipeline Parallelism and Expert Parallelism](#tensor-parallelism-pipeline-parallelism-and-expert-parallelism) + - [MIG Support](#mig-support) + - [Scheduling](#scheduling) + - [Key-Value Cache](#key-value-cache) + - [Decoding](#decoding) + - [Decoding Modes - Top-k, Top-p, Top-k Top-p, Beam Search, Medusa, ReDrafter, Lookahead and Eagle](#decoding-modes---top-k-top-p-top-k-top-p-beam-search-medusa-redrafter-lookahead-and-eagle) + - [Speculative Decoding](#speculative-decoding) + - [Chunked Context](#chunked-context) + - [Quantization](#quantization) + - [LoRa](#lora) + - [Launch Triton server *within Slurm based clusters*](#launch-triton-server-within-slurm-based-clusters) + - [Prepare some scripts](#prepare-some-scripts) + - [Submit a Slurm job](#submit-a-slurm-job) + - [Triton Metrics](#triton-metrics) + - [Benchmarking](#benchmarking) + - [Testing the TensorRT-LLM Backend](#testing-the-tensorrt-llm-backend) + +## Getting Started + +### Quick Start + +Below is an example of how to serve a TensorRT-LLM model with the Triton +TensorRT-LLM Backend on a 4-GPU environment. The example uses the GPT model from +the +[TensorRT-LLM repository](https://github.com/NVIDIA/TensorRT-LLM/tree/v0.11.0/examples/gpt) +with the +[NGC Triton TensorRT-LLM container](https://catalog.ngc.nvidia.com/orgs/nvidia/containers/tritonserver). +Make sure you are cloning the same version of TensorRT-LLM backend as the +version of TensorRT-LLM in the container. Please refer to the +[support matrix](https://docs.nvidia.com/deeplearning/frameworks/support-matrix/index.html) +to see the aligned versions. + +In this example, we will use Triton 24.07 with TensorRT-LLM v0.11.0. + + +#### Launch Triton TensorRT-LLM container + +Launch Triton docker container `nvcr.io/nvidia/tritonserver:-trtllm-python-py3` +with TensorRT-LLM backend. + +Make an `engines` folder outside docker to reuse engines for future runs. Make +sure to replace the `` with the version of Triton that you want to use. 
+ +```bash +docker run --rm -it --net host --shm-size=2g \ + --ulimit memlock=-1 --ulimit stack=67108864 --gpus all \ + -v :/engines \ + nvcr.io/nvidia/tritonserver:24.07-trtllm-python-py3 +``` + +#### Prepare TensorRT-LLM engines -## Introduction +You can skip this step if you already have the engines ready. +Follow the [guide](https://github.com/NVIDIA/TensorRT-LLM/tree/main/examples) in +TensorRT-LLM repository for more details on how to to prepare the engines for +all the supported models. You can also check out the +[tutorials](https://github.com/triton-inference-server/tutorials) to see more +examples with serving TensorRT-LLM models. -This document describes how to serve models by TensorRT-LLM Triton backend. This backend is only an interface to call TensorRT-LLM in Triton. The heavy lifting, in terms of implementation, can be found in the TensorRT-LLM source code. +```bash +cd /app/tensorrt_llm/examples/gpt + +# Download weights from HuggingFace Transformers +rm -rf gpt2 && git clone https://huggingface.co/gpt2-medium gpt2 +pushd gpt2 && rm pytorch_model.bin model.safetensors && wget -q https://huggingface.co/gpt2-medium/resolve/main/pytorch_model.bin && popd + +# Convert weights from HF Tranformers to TensorRT-LLM checkpoint +python3 convert_checkpoint.py --model_dir gpt2 \ + --dtype float16 \ + --tp_size 4 \ + --output_dir ./c-model/gpt2/fp16/4-gpu + +# Build TensorRT engines +trtllm-build --checkpoint_dir ./c-model/gpt2/fp16/4-gpu \ + --gpt_attention_plugin float16 \ + --remove_input_padding enable \ + --kv_cache_type paged \ + --gemm_plugin float16 \ + --output_dir /engines/gpt/fp16/4-gpu +``` + +See [here](https://github.com/NVIDIA/TensorRT-LLM/tree/main/examples/gpt) for +more details on the parameters. + +#### Prepare the Model Repository + +Next, create the +[model repository](https://github.com/triton-inference-server/server/blob/main/docs/user_guide/model_repository.md) +that will be used by the Triton server. The models can be found in the +[all_models](https://github.com/NVIDIA/TensorRT-LLM/blob/main/triton_backend/all_models) folder. The folder contains two groups of models: +- [`gpt`](https://github.com/NVIDIA/TensorRT-LLM/blob/main/triton_backend/all_models/gpt): Using TensorRT-LLM pure Python runtime. +- [`inflight_batcher_llm`](https://github.com/NVIDIA/TensorRT-LLM/blob/main/triton_backend/all_models/inflight_batcher_llm/)`: Using the C++ +TensorRT-LLM backend with the executor API, which includes the latest features +including inflight batching. + +There are five models in +[all_models/inflight_batcher_llm](https://github.com/NVIDIA/TensorRT-LLM/blob/main/triton_backend/all_models/inflight_batcher_llm) that will +be used in this example: + +| Model | Description | +| :------------: | :---------------: | +| `ensemble` | This model is used to chain the preprocessing, tensorrt_llm and postprocessing models together. | +| `preprocessing` | This model is used for tokenizing, meaning the conversion from prompts(string) to input_ids(list of ints). | +| `tensorrt_llm` | This model is a wrapper of your TensorRT-LLM model and is used for inferencing. Input specification can be found [here](https://github.com/NVIDIA/TensorRT-LLM/blob/main/docs/source/advanced/inference-request.md) | +| `postprocessing` | This model is used for de-tokenizing, meaning the conversion from output_ids(list of ints) to outputs(string). | +| `tensorrt_llm_bls` | This model can also be used to chain the preprocessing, tensorrt_llm and postprocessing models together. 
| + +To learn more about ensemble and BLS models, please see the +[Ensemble Models](https://github.com/triton-inference-server/server/blob/main/docs/user_guide/architecture.md#ensemble-models) +and +[Business Logic Scripting](https://github.com/triton-inference-server/python_backend#business-logic-scripting) +documentation. + +To learn more about the benefits and the limitations of using the BLS model, +please see the [model config](./docs/model_config.md#tensorrt_llm_bls-model) section. -## Setup Environment +```bash +mkdir /triton_model_repo +cp -r /app/all_models/inflight_batcher_llm/* /triton_model_repo/ +``` -### Prepare the repository +#### Modify the Model Configuration +Use the script to fill in the parameters in the model configuration files. For +optimal performance or custom parameters, please refer to +[perf_best_practices](https://github.com/NVIDIA/TensorRT-LLM/blob/main/docs/source/performance/perf-best-practices.md). +For more details on the model configuration and the parameters that can be +modified, please refer to the [model config](./docs/model_config.md) section. -Clone the repository, and update submodules recursively. +```bash +ENGINE_DIR=/engines/gpt/fp16/4-gpu +TOKENIZER_DIR=/app/tensorrt_llm/examples/gpt/gpt2 +MODEL_FOLDER=/triton_model_repo +TRITON_MAX_BATCH_SIZE=4 +INSTANCE_COUNT=1 +MAX_QUEUE_DELAY_MS=0 +MAX_QUEUE_SIZE=0 +FILL_TEMPLATE_SCRIPT=/app/tools/fill_template.py +DECOUPLED_MODE=false +LOGITS_DATATYPE=TYPE_FP32 + +python3 ${FILL_TEMPLATE_SCRIPT} -i ${MODEL_FOLDER}/ensemble/config.pbtxt triton_max_batch_size:${TRITON_MAX_BATCH_SIZE},logits_datatype:${LOGITS_DATATYPE} +python3 ${FILL_TEMPLATE_SCRIPT} -i ${MODEL_FOLDER}/preprocessing/config.pbtxt tokenizer_dir:${TOKENIZER_DIR},triton_max_batch_size:${TRITON_MAX_BATCH_SIZE},preprocessing_instance_count:${INSTANCE_COUNT} +python3 ${FILL_TEMPLATE_SCRIPT} -i ${MODEL_FOLDER}/tensorrt_llm/config.pbtxt triton_backend:tensorrtllm,triton_max_batch_size:${TRITON_MAX_BATCH_SIZE},decoupled_mode:${DECOUPLED_MODE},engine_dir:${ENGINE_DIR},max_queue_delay_microseconds:${MAX_QUEUE_DELAY_MS},batching_strategy:inflight_fused_batching,max_queue_size:${MAX_QUEUE_SIZE},encoder_input_features_data_type:TYPE_FP16,logits_datatype:${LOGITS_DATATYPE} +python3 ${FILL_TEMPLATE_SCRIPT} -i ${MODEL_FOLDER}/postprocessing/config.pbtxt tokenizer_dir:${TOKENIZER_DIR},triton_max_batch_size:${TRITON_MAX_BATCH_SIZE},postprocessing_instance_count:${INSTANCE_COUNT} +python3 ${FILL_TEMPLATE_SCRIPT} -i ${MODEL_FOLDER}/tensorrt_llm_bls/config.pbtxt triton_max_batch_size:${TRITON_MAX_BATCH_SIZE},decoupled_mode:${DECOUPLED_MODE},bls_instance_count:${INSTANCE_COUNT},logits_datatype:${LOGITS_DATATYPE} +``` + +> **NOTE**: +It is recommended to match the number of pre/post_instance_counts with triton_max_batch_size for better performance. + +#### Serving with Triton + +Now, you're ready to launch the Triton server with the TensorRT-LLM model. + +Use the launch_triton_server.py script. This launches multiple instances of tritonserver with MPI. + +```bash +# 'world_size' is the number of GPUs you want to use for serving. This should +# be aligned with the number of GPUs used to build the TensorRT-LLM engine. +python3 /app/scripts/launch_triton_server.py --world_size=4 --model_repo=${MODEL_FOLDER} +``` + +You should see the following logs when the server is successfully deployed. + +```bash +... 
+I0503 22:01:25.210518 1175 grpc_server.cc:2463] Started GRPCInferenceService at 0.0.0.0:8001 +I0503 22:01:25.211612 1175 http_server.cc:4692] Started HTTPService at 0.0.0.0:8000 +I0503 22:01:25.254914 1175 http_server.cc:362] Started Metrics Service at 0.0.0.0:8002 +``` + +To stop Triton Server inside the container, run: + +```bash +pkill tritonserver +``` + +#### Send an Inference Request + +##### Using the [generate endpoint](https://github.com/triton-inference-server/server/blob/main/docs/protocol/extension_generate.md) + +The general format of the generate endpoint: +```bash +curl -X POST localhost:8000/v2/models/${MODEL_NAME}/generate -d '{"{PARAM1_KEY}": "{PARAM1_VALUE}", ... }' +``` + +In the case of the models used in this example, you can replace MODEL_NAME with +`ensemble` or `tensorrt_llm_bls`. Examining the ensemble and tensorrt_llm_bls +model's config.pbtxt file, you can see that 4 parameters are required to +generate a response for this model: + +- text_input: Input text to generate a response from +- max_tokens: The number of requested output tokens +- bad_words: A list of bad words (can be empty) +- stop_words: A list of stop words (can be empty) + +Therefore, we can query the server in the following way: + +- if using the ensemble model +```bash +curl -X POST localhost:8000/v2/models/ensemble/generate -d '{"text_input": "What is machine learning?", "max_tokens": 20, "bad_words": "", "stop_words": ""}' ``` -git clone git@github.com:triton-inference-server/tensorrtllm_backend.git -git submodule update --init --recursive -git lfs install -git lfs pull + +- if using the tensorrt_llm_bls model + +```bash +curl -X POST localhost:8000/v2/models/tensorrt_llm_bls/generate -d '{"text_input": "What is machine learning?", "max_tokens": 20, "bad_words": "", "stop_words": ""}' ``` -### Build the Docker image. +Which should return a result similar to (formatted for readability): +```bash +{ + "model_name": "ensemble", + "model_version": "1", + "sequence_end": false, + "sequence_id": 0, + "sequence_start": false, + "text_output": "What is machine learning?\n\nMachine learning is a method of learning by using machine learning algorithms to solve problems.\n\n" +} ``` -cd tensorrtllm_backend -docker build -f dockerfile/Dockerfile.trt_llm_backend -t tritonserver:w_trt_llm_backend . + +##### Using the client scripts + +You can refer to the client scripts in the +[inflight_batcher_llm/client](https://github.com/NVIDIA/TensorRT-LLM/blob/main/triton_backend/inflight_batcher_llm/client) to see how to send +requests via Python scripts. + +Below is an example of using +[inflight_batcher_llm_client](https://github.com/NVIDIA/TensorRT-LLM/blob/main/triton_backend/inflight_batcher_llm/client/inflight_batcher_llm_client.py) +to send requests to the `tensorrt_llm` model. + +```bash +pip3 install tritonclient[all] +INFLIGHT_BATCHER_LLM_CLIENT=/app/inflight_batcher_llm/client/inflight_batcher_llm_client.py +python3 ${INFLIGHT_BATCHER_LLM_CLIENT} --request-output-len 200 --tokenizer-dir ${TOKENIZER_DIR} ``` -The rest of the documentation assumes that the Docker image has already been built. 
+The result should be similar to the following: -### How to select the models -There are two models under `all_models/`: -- gpt: A Python implementation of the TensorRT-LLM Triton backend -- inflight_batcher_llm: A C++ implementation of the TensorRT-LLM Triton backend +```bash +Using pad_id: 50256 +Using end_id: 50256 +Input sequence: [28524, 287, 5093, 12, 23316, 4881, 11, 30022, 263, 8776, 355, 257] +Got completed request +Input: Born in north-east France, Soyer trained as a +Output beam 0: chef before moving to London in the early 1990s. He has since worked in restaurants in London, Paris, Milan and New York. -### Prepare TensorRT-LLM engines -Follow the [guide](https://github.com/NVIDIA/TensorRT-LLM/blob/main/README.md) in TensorRT-LLM to prepare the engines for deployment. +He is married to the former model and actress, Anna-Marie, and has two children, a daughter, Emma, and a son, Daniel. -For example, please find the details in the document of TensorRT-LLM GPT for instrutions to build GPT engines: [link](https://github.com/NVIDIA/TensorRT-LLM/tree/main/examples/gpt#usage) +Soyer's wife, Anna-Marie, is a former model and actress. -### How to set the model configuration +He is survived by his wife, Anna-Marie, and their two children, Daniel and Emma. -**TensorRT-LLM Triton Serving Configuration: config.pbtxt** +Soyer was born in the north-east of France, and moved to London in the early 1990s. -- This will be loaded by Triton servers -- This mainly describes the server and TensorRT-LLM inference hyperparameters. +He was a chef at the London restaurant, The Bistro, before moving to New York in the early 2000s. -There are several components in each implemented backend, and there is a config.pbtxt for each component, take `all_models/inflight_batcher_llm` as an example: -- preprocessing: Used for tokenizing. -- tensorrt_llm: Inferencing. -- postprocessing: Used for de-tokenizing. -- ensemble: Connect preprocessing -> tensorrt_llm -> postprocessing +He was a regular at the restaurant, and was also a regular at the restaurant, The Bistro, before moving to London in the early 2000s. 
-The following table shows the fields that need to be modified before deployment: +Soyer was a regular at the restaurant, and was +Output sequence: [28524, 287, 5093, 12, 23316, 4881, 11, 30022, 263, 8776, 355, 257, 21221, 878, 3867, 284, 3576, 287, 262, 1903, 6303, 82, 13, 679, 468, 1201, 3111, 287, 10808, 287, 3576, 11, 6342, 11, 21574, 290, 968, 1971, 13, 198, 198, 1544, 318, 6405, 284, 262, 1966, 2746, 290, 14549, 11, 11735, 12, 44507, 11, 290, 468, 734, 1751, 11, 257, 4957, 11, 18966, 11, 290, 257, 3367, 11, 7806, 13, 198, 198, 50, 726, 263, 338, 3656, 11, 11735, 12, 44507, 11, 318, 257, 1966, 2746, 290, 14549, 13, 198, 198, 1544, 318, 11803, 416, 465, 3656, 11, 11735, 12, 44507, 11, 290, 511, 734, 1751, 11, 7806, 290, 18966, 13, 198, 198, 50, 726, 263, 373, 4642, 287, 262, 5093, 12, 23316, 286, 4881, 11, 290, 3888, 284, 3576, 287, 262, 1903, 6303, 82, 13, 198, 198, 1544, 373, 257, 21221, 379, 262, 3576, 7072, 11, 383, 347, 396, 305, 11, 878, 3867, 284, 968, 1971, 287, 262, 1903, 4751, 82, 13, 198, 198, 1544, 373, 257, 3218, 379, 262, 7072, 11, 290, 373, 635, 257, 3218, 379, 262, 7072, 11, 383, 347, 396, 305, 11, 878, 3867, 284, 3576, 287, 262, 1903, 4751, 82, 13, 198, 198, 50, 726, 263, 373, 257, 3218, 379, 262, 7072, 11, 290, 373] +``` -*all_models/inflight_batcher_llm/preprocessing/config.pbtxt* +###### Early stopping -| Name | Description -| :----------------------: | :-----------------------------: | -| `tokenizer_dir` | The path to the tokenizer for the model | -| `tokenizer_type` | The type of the tokenizer for the model, t5, auto and llama are supported | +You can also stop the generation process early by using the `--stop-after-ms` +option to send a stop request after a few milliseconds: -*all_models/inflight_batcher_llm/tensorrt_llm/config.pbtxt* +```bash +python3 ${INFLIGHT_BATCHER_LLM_CLIENT} --stop-after-ms 200 --request-output-len 200 --request-id 1 --tokenizer-dir ${TOKENIZER_DIR} +``` -| Name | Description -| :----------------------: | :-----------------------------: | -| `decoupled` | Controls streaming. Decoupled mode must be set to true if using the streaming option from the client. | -| `gpt_model_type` | "inflight_fused_batching" or "V1" (disable in-flight batching) | -| `gpt_model_path` | Path to the TensorRT-LLM engines for deployment | +You will find that the generation process is stopped early and therefore the +number of generated tokens is lower than 200. You can have a look at the +client code to see how early stopping is achieved. -*all_models/inflight_batcher_llm/postprocessing/config.pbtxt* +###### Return context logits and/or generation logits -| Name | Description -| :----------------------: | :-----------------------------: | -| `tokenizer_dir` | The path to the tokenizer for the model | -| `tokenizer_type` | The type of the tokenizer for the model, t5, auto and llama are supported | +If you want to get context logits and/or generation logits, you need to enable +`--gather_context_logits` and/or `--gather_generation_logits` when building the +engine (or `--gather_all_token_logits` to enable both at the same time). For +more setting details about these two flags, please refer to +[build.py](https://github.com/NVIDIA/TensorRT-LLM/blob/main/tensorrt_llm/commands/build.py) +or +[gpt_runtime](https://github.com/NVIDIA/TensorRT-LLM/blob/main/docs/source/advanced/gpt-runtime.md). 
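+
+For example, extending the Quick Start GPT build above, the engine can be
+rebuilt with logits gathering enabled. This is a hedged sketch that reuses the
+earlier checkpoint and engine paths; the only additions are the two gather
+flags named above:
+
+```bash
+# Rebuild the engine so that context and generation logits are kept and can be
+# returned to the client (paths reuse the Quick Start checkpoint/engine layout).
+trtllm-build --checkpoint_dir ./c-model/gpt2/fp16/4-gpu \
+             --gpt_attention_plugin float16 \
+             --remove_input_padding enable \
+             --kv_cache_type paged \
+             --gemm_plugin float16 \
+             --gather_context_logits \
+             --gather_generation_logits \
+             --output_dir /engines/gpt/fp16/4-gpu
+```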
-## Run Serving on Single Node +After launching the server, you could get the output of logits by passing the +corresponding parameters `--return-context-logits` and/or +`--return-generation-logits` in the client scripts +([end_to_end_grpc_client.py](./tensorrt_llm/triton_backend/inflight_batcher_llm/client/end_to_end_grpc_client.py) +and +[inflight_batcher_llm_client.py](./tensorrt_llm/triton_backend/inflight_batcher_llm/client/inflight_batcher_llm_client.py)). -### Launch the backend *within Docker* +For example: ```bash -# 1. Pull the docker image -nvidia-docker run -it --rm -e LOCAL_USER_ID=`id -u ${USER}` --shm-size=2g -v : bash +python3 ${INFLIGHT_BATCHER_LLM_CLIENT} --request-output-len 20 --tokenizer-dir ${TOKENIZER_DIR} --return-context-logits --return-generation-logits +``` -# 2. Modify parameters: -1. all_models//tensorrt_llm/config.pbtxt -2. all_models//preprocessing/config.pbtxt -3. all_models//postprocessing/config.pbtxt +The result should be similar to the following: -# 3. Launch triton server -python3 scripts/launch_triton_server.py --world_size= \ - --model_repo=all_models/ +```bash +Input sequence: [28524, 287, 5093, 12, 23316, 4881, 11, 30022, 263, 8776, 355, 257] +Got completed request +Input: Born in north-east France, Soyer trained as a +Output beam 0: has since worked in restaurants in London, +Output sequence: [21221, 878, 3867, 284, 3576, 287, 262, 1903, 6303, 82, 13, 679, 468, 1201, 3111, 287, 10808, 287, 3576, 11] +context_logits.shape: (1, 12, 50257) +context_logits: [[[ -65.9822 -62.267445 -70.08991 ... -76.16964 -78.8893 + -65.90678 ] + [-103.40278 -102.55243 -106.119026 ... -108.925415 -109.408585 + -101.37687 ] + [ -63.971176 -64.03466 -67.58809 ... -72.141235 -71.16892 + -64.23846 ] + ... + [ -80.776375 -79.1815 -85.50916 ... -87.07368 -88.02817 + -79.28435 ] + [ -10.551408 -7.786484 -14.524468 ... -13.805856 -15.767286 + -7.9322424] + [-106.33096 -105.58956 -111.44852 ... -111.04858 -111.994194 + -105.40376 ]]] +generation_logits.shape: (1, 1, 20, 50257) +generation_logits: [[[[-106.33096 -105.58956 -111.44852 ... -111.04858 -111.994194 + -105.40376 ] + [ -77.867424 -76.96638 -83.119095 ... -87.82542 -88.53957 + -75.64877 ] + [-136.92282 -135.02484 -140.96051 ... -141.78284 -141.55045 + -136.01668 ] + ... + [-100.03721 -98.98237 -105.25507 ... -108.49254 -109.45882 + -98.95136 ] + [-136.78777 -136.16165 -139.13437 ... -142.21495 -143.57468 + -134.94667 ] + [ 19.222942 19.127287 14.804495 ... 10.556551 9.685863 + 19.625107]]]] ``` -### Launch the backend *within Slurm based clusters* -1. Prepare some scripts +##### Requests with batch size > 1 + +The TRT-LLM backend supports requests with batch size greater than one. When +sending a request with a batch size greater than one, the TRT-LLM backend will +return multiple batch size 1 responses, where each response will be associated +with a given batch index. An output tensor named `batch_index` is associated +with each response to indicate which batch index this response corresponds to. + +The client script +[end_to_end_grpc_client.py](https://github.com/NVIDIA/TensorRT-LLM/blob/main/triton_backend/inflight_batcher_llm/client/end_to_end_grpc_client.py) +demonstrates how a client can send requests with batch size > 1 and consume the +responses returned from Triton. When passing `--batch-inputs` to the client +script, the client will create a request with multiple prompts, and use the +`batch_index` output tensor to associate the responses to the original prompt. 
+For example one could run: + +``` +python3 /app/inflight_batcher_llm/client/end_to_end_grpc_client.py -o 5 -p '["This is a test","I want you to","The cat is"]' --batch-inputs +``` + +to send a request with a batch size of 3 to the Triton server. + +## Building from Source + +Please refer to the [build.md](./docs/build.md) for more details on how to +build the Triton TRT-LLM container from source. + +## Supported Models + +Only a few examples are listed here. For all the supported models, please refer +to the [support matrix](https://nvidia.github.io/TensorRT-LLM/reference/support-matrix.html). + +- LLaMa + - [End to end workflow to run llama 7b with Triton](https://github.com/triton-inference-server/tensorrtllm_backend/blob/main/docs/llama.md) + - [Build and run a LLaMA model in TensorRT-LLM](https://github.com/NVIDIA/TensorRT-LLM/tree/main/examples/llama) + - [Llama Multi-instance](https://github.com/triton-inference-server/tensorrtllm_backend/blob/main/docs/llama_multi_instance.md) + - [Deploying Hugging Face Llama2-7b Model in Triton](https://github.com/triton-inference-server/tutorials/blob/main/Popular_Models_Guide/Llama2/trtllm_guide.md#infer-with-tensorrt-llm-backend) + +- Gemma + - [End to end workflow to run sp model with Triton](https://github.com/triton-inference-server/tensorrtllm_backend/blob/main/docs/gemma.md) + - [Run Gemma on TensorRT-LLM](https://github.com/NVIDIA/TensorRT-LLM/tree/main/examples/gemma) + +- Mistral + - [Build and run a Mixtral model in TensorRT-LLM](https://github.com/NVIDIA/TensorRT-LLM/blob/main/examples/mixtral/README.md) + +- Multi-modal + - [End to end workflow to run multimodal models(e.g. BLIP2-OPT, LLava1.5-7B, VILA) with Triton](https://github.com/triton-inference-server/tensorrtllm_backend/blob/main/docs/multimodal.md) + - [Deploying Hugging Face Llava1.5-7b Model in Triton](https://github.com/triton-inference-server/tutorials/blob/main/Popular_Models_Guide/Llava1.5/llava_trtllm_guide.md) + +- Encoder-Decoder + - [End to end workflow to run an Encoder-Decoder model with Triton](https://github.com/triton-inference-server/tensorrtllm_backend/blob/main/docs/encoder_decoder.md) + +## Model Config + +Please refer to the [model config](./docs/model_config.md) for more details on +the model configuration. + +## Model Deployment + +### TRT-LLM Multi-instance Support + +TensorRT-LLM backend relies on MPI to coordinate the execution of a model across +multiple GPUs and nodes. Currently, there are two different modes supported to +run a model across multiple GPUs, **Leader Mode** and **Orchestrator Mode**. + +> **Note**: This is different from the model multi-instance support from Triton +> Server which allows multiple instances of a model to be run on the same or +> different GPUs. For more information on Triton Server multi-instance support, +> please refer to the +> [Triton model config documentation](https://github.com/triton-inference-server/server/blob/main/docs/user_guide/model_configuration.md#instance-groups). + +#### Leader Mode + +In leader mode, TensorRT-LLM backend spawns one Triton Server process for every +GPU. The process with rank 0 is the leader process. Other Triton Server processes, +do not return from the `TRITONBACKEND_ModelInstanceInitialize` call to avoid +port collision and allowing the other processes to receive requests. 
+ +The overview of this mode is described in the diagram below: + +![Leader Mode Overview](./images/leader-mode.png) + +This mode is friendly with [slurm](https://slurm.schedmd.com) deployments since +it doesn't use +[MPI_Comm_spawn](https://www.open-mpi.org/doc/v4.1/man3/MPI_Comm_spawn.3.php). + +#### Orchestrator Mode + +In orchestrator mode, the TensorRT-LLM backend spawns a single Triton Server +process that acts as an orchestrator and spawns one Triton Server process for +every GPU that each model requires. This mode is mainly used when serving +multiple models with TensorRT-LLM backend. In this mode, the `MPI` world size +must be one as TRT-LLM backend will automatically create new workers as needed. +The overview of this mode is described in the diagram below: + +![Orchestrator Mode Overview](./images/orchestrator-mode.png) + +Since this mode uses +[MPI_Comm_spawn](https://www.open-mpi.org/doc/v4.1/man3/MPI_Comm_spawn.3.php), +it might not work properly with [slurm](https://slurm.schedmd.com) deployments. +Additionally, this currently only works for single node deployments. + +#### Running Multiple Instances of LLaMa Model + +Please refer to +[Running Multiple Instances of the LLaMa Model](docs/llama_multi_instance.md) +for more information on running multiple instances of LLaMa model in different +configurations. + +### Multi-node Support + +Check out the +[Multi-Node Generative AI w/ Triton Server and TensorRT-LLM](https://github.com/triton-inference-server/tutorials/tree/main/Deployment/Kubernetes/TensorRT-LLM_Multi-Node_Distributed_Models) +tutorial for Triton Server and TensorRT-LLM multi-node deployment. + +### Model Parallelism + +#### Tensor Parallelism, Pipeline Parallelism and Expert Parallelism + +[Tensor Parallelism](https://docs.nvidia.com/nemo-framework/user-guide/latest/nemotoolkit/features/parallelisms.html#tensor-parallelism), +[Pipeline Parallelism](https://docs.nvidia.com/nemo-framework/user-guide/latest/nemotoolkit/features/parallelisms.html#pipeline-parallelism) +and +[Expert parallelism](https://docs.nvidia.com/nemo-framework/user-guide/latest/nemotoolkit/features/parallelisms.html#expert-parallelism) +are supported in TensorRT-LLM. + +See the models in the +[examples](https://github.com/NVIDIA/TensorRT-LLM/tree/main/examples) folder for +more details on how to build the engines with tensor parallelism, pipeline +parallelism and expert parallelism. + +Some examples are shown below: + +- Build LLaMA v3 70B using 4-way tensor parallelism and 2-way pipeline parallelism. + +```bash +python3 convert_checkpoint.py --model_dir ./tmp/llama/70B/hf/ \ + --output_dir ./tllm_checkpoint_8gpu_tp4_pp2 \ + --dtype float16 \ + --tp_size 4 \ + --pp_size 2 + +trtllm-build --checkpoint_dir ./tllm_checkpoint_8gpu_tp4_pp2 \ + --output_dir ./tmp/llama/70B/trt_engines/fp16/8-gpu/ \ + --gemm_plugin auto +``` + +- Build Mixtral8x22B with tensor parallelism and expert parallelism + +```bash +python3 ../llama/convert_checkpoint.py --model_dir ./Mixtral-8x22B-v0.1 \ + --output_dir ./tllm_checkpoint_mixtral_8gpu \ + --dtype float16 \ + --tp_size 8 \ + --moe_tp_size 2 \ + --moe_ep_size 4 +trtllm-build --checkpoint_dir ./tllm_checkpoint_mixtral_8gpu \ + --output_dir ./trt_engines/mixtral/tp2ep4 \ + --gemm_plugin float16 +``` + +See the +[doc](https://github.com/NVIDIA/TensorRT-LLM/blob/main/docs/source/advanced/expert-parallelism.md) +to learn more about how TensorRT-LLM expert parallelism works in Mixture of Experts (MoE). 
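+
+Serving a multi-GPU engine follows the same steps as the Quick Start: point the
+`engine_dir` parameter of the `tensorrt_llm` model at the engine you built and
+launch with a `world_size` equal to the total number of GPUs the engine was
+built for. A minimal sketch for the 8-GPU Mixtral engine above (the engine path
+comes from the build command; everything else mirrors the Quick Start):
+
+```bash
+# Fill engine_dir in tensorrt_llm/config.pbtxt with fill_template.py as in the
+# Quick Start, pointing it at ./trt_engines/mixtral/tp2ep4, then launch with a
+# world_size that matches the number of GPUs the engine was built for (8 here).
+python3 /app/scripts/launch_triton_server.py --world_size=8 --model_repo=${MODEL_FOLDER}
+```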
+ +### MIG Support + +See the +[MIG tutorial](https://github.com/triton-inference-server/tutorials/tree/main/Deployment/Kubernetes) +for more details on how to run TRT-LLM models and Triton with MIG. + +### Scheduling + +The scheduler policy helps the batch manager adjust how requests are scheduled +for execution. There are two scheduler policies supported in TensorRT-LLM, +`MAX_UTILIZATION` and `GUARANTEED_NO_EVICT`. See the +[batch manager design](https://github.com/NVIDIA/TensorRT-LLM/blob/main/docs/source/advanced/batch-manager.md#gptmanager-design) +to learn more about how scheduler policies work. You can specify the scheduler +policy via the `batch_scheduler_policy` parameter in the +[model config](./docs/model_config.md#tensorrt_llm_model) of tensorrt_llm model. + +### Key-Value Cache + +See the +[KV Cache](https://github.com/NVIDIA/TensorRT-LLM/blob/main/docs/source/advanced/gpt-attention.md#kv-cache) +section for more details on how TensorRT-LLM supports KV cache. Also, check out +the [KV Cache Reuse](https://github.com/NVIDIA/TensorRT-LLM/blob/main/docs/source/kv_cache_reuse.md) +documentation to learn more about how to enable KV cache reuse when building the +TRT-LLM engine. Parameters for KV cache can be found in the +[model config](./docs/model_config.md#tensorrt_llm_model) of tensorrt_llm model. + +### Decoding + +#### Decoding Modes - Top-k, Top-p, Top-k Top-p, Beam Search, Medusa, ReDrafter, Lookahead and Eagle + +TensorRT-LLM supports various decoding modes, including top-k, top-p, +top-k top-p, beam search Medusa, ReDrafter, Lookahead and Eagle. See the +[Sampling Parameters](https://github.com/NVIDIA/TensorRT-LLM/blob/main/docs/source/advanced/gpt-runtime.md#sampling-parameters) +section to learn more about top-k, top-p, top-k top-p and beam search decoding. +Please refer to the +[speculative decoding documentation](https://github.com/NVIDIA/TensorRT-LLM/blob/main/docs/source/advanced/speculative-decoding.md) +for more details on Medusa, ReDrafter, Lookahead and Eagle. + +Parameters for decoding modes can be found in the +[model config](./docs/model_config.md#tensorrt_llm_model) of tensorrt_llm model. + +#### Speculative Decoding + +See the +[Speculative Decoding](https://github.com/NVIDIA/TensorRT-LLM/blob/main/docs/source/speculative_decoding.md) +documentation to learn more about how TensorRT-LLM supports speculative decoding +to improve the performance. The parameters for speculative decoding can be found +in the [model config](./docs/model_config.md#tensorrt_llm_bls_model) of +tensorrt_llm_bls model. + +### Chunked Context + +For more details on how to use chunked context, please refer to the +[Chunked Context](https://github.com/NVIDIA/TensorRT-LLM/blob/main/docs/source/advanced/gpt-attention.md#chunked-context) +section. Parameters for chunked context can be found in the +[model config](./docs/model_config.md#tensorrt_llm_model) of tensorrt_llm model. + +### Quantization + +Check out the +[Quantization Guide](https://github.com/NVIDIA/TensorRT-LLM/blob/main/examples/quantization/README.md) +to learn more about how to install the quantization toolkit and quantize +TensorRT-LLM models. Also, check out the blog post +[Speed up inference with SOTA quantization techniques in TRT-LLM](https://github.com/NVIDIA/TensorRT-LLM/blob/main/docs/source/blogs/quantization-in-TRT-LLM.md) +to learn more about how to speed up inference with quantization. + +### LoRa + +Refer to [lora.md](./docs/lora.md) for more details on how to use LoRa +with TensorRT-LLM and Triton. 
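+
+As a quick orientation, the example client shipped with the backend can attach
+a LoRA adapter to a request. The sketch below assumes the `--lora-path` and
+`--lora-task-id` options of `inflight_batcher_llm_client.py`; treat the exact
+flag names and paths as assumptions and see [lora.md](./docs/lora.md) for the
+authoritative walkthrough:
+
+```bash
+# The adapter weights are sent with the request and associated with the given
+# task id; see lora.md for how adapters are cached and reused across requests.
+python3 ${INFLIGHT_BATCHER_LLM_CLIENT} --request-output-len 64 \
+    --tokenizer-dir ${TOKENIZER_DIR} \
+    --lora-path /path/to/converted/lora/weights \
+    --lora-task-id 1
+```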
+ +## Launch Triton server *within Slurm based clusters* + +### Prepare some scripts `tensorrt_llm_triton.sub` ```bash #!/bin/bash #SBATCH -o logs/tensorrt_llm.out #SBATCH -e logs/tensorrt_llm.error -#SBATCH -J gpu-comparch-ftp:mgmn -#SBATCH -A gpu-comparch -#SBATCH -p luna +#SBATCH -J +#SBATCH -A +#SBATCH -p #SBATCH --nodes=1 #SBATCH --ntasks-per-node=8 #SBATCH --time=00:30:00 sudo nvidia-smi -lgc 1410,1410 -srun --mpi=pmix --container-image \ - --container-mounts : \ - --container-workdir \ +srun --mpi=pmix \ + --container-image triton_trt_llm \ + --container-workdir /tensorrtllm_backend \ --output logs/tensorrt_llm_%t.out \ - bash /tensorrt_llm_triton.sh + bash /tensorrtllm_backend/tensorrt_llm_triton.sh ``` `tensorrt_llm_triton.sh` -``` +```bash TRITONSERVER="/opt/tritonserver/bin/tritonserver" -MODEL_REPO="/triton_backend/" +MODEL_REPO="/triton_model_repo" ${TRITONSERVER} --model-repository=${MODEL_REPO} --disable-auto-complete-config --backend-config=python,shm-region-prefix-name=prefix${SLURM_PROCID}_ ``` -2. Submit a Slurm job +If srun initializes the mpi environment, you can use the following command to launch the Triton server: + +```bash +srun --mpi pmix launch_triton_server.py --oversubscribe ``` + +### Submit a Slurm job + +```bash sbatch tensorrt_llm_triton.sub ``` -When successfully deployed, the server produces logs similar to the following ones. -``` -I0919 14:52:10.475738 293 grpc_server.cc:2451] Started GRPCInferenceService at 0.0.0.0:8001 -I0919 14:52:10.475968 293 http_server.cc:3558] Started HTTPService at 0.0.0.0:8000 -I0919 14:52:10.517138 293 http_server.cc:187] Started Metrics Service at 0.0.0.0:8002 +You might have to contact your cluster's administrator to help you customize the above script. + +## Triton Metrics + +Starting with the 23.11 release of Triton, users can now obtain TRT LLM Batch +Manager [statistics](https://github.com/NVIDIA/TensorRT-LLM/blob/main/docs/source/advanced/batch-manager.md#statistics) +by querying the Triton metrics endpoint. This can be accomplished by launching +a Triton server in any of the ways described above (ensuring the build code / +container is 23.11 or later) and querying the server. Upon receiving a +successful response, you can query the metrics endpoint by entering the +following: + +```bash +curl localhost:8002/metrics ``` -### Kill the backend +Batch manager statistics are reported by the metrics endpoint in fields that +are prefixed with `nv_trt_llm_`. 
Your output for these fields should look +similar to the following (assuming your model is an inflight batcher model): ```bash -pgrep tritonserver | xargs kill -9 +# HELP nv_trt_llm_request_metrics TRT LLM request metrics +# TYPE nv_trt_llm_request_metrics gauge +nv_trt_llm_request_metrics{model="tensorrt_llm",request_type="waiting",version="1"} 1 +nv_trt_llm_request_metrics{model="tensorrt_llm",request_type="context",version="1"} 1 +nv_trt_llm_request_metrics{model="tensorrt_llm",request_type="scheduled",version="1"} 1 +nv_trt_llm_request_metrics{model="tensorrt_llm",request_type="max",version="1"} 512 +nv_trt_llm_request_metrics{model="tensorrt_llm",request_type="active",version="1"} 0 +# HELP nv_trt_llm_runtime_memory_metrics TRT LLM runtime memory metrics +# TYPE nv_trt_llm_runtime_memory_metrics gauge +nv_trt_llm_runtime_memory_metrics{memory_type="pinned",model="tensorrt_llm",version="1"} 0 +nv_trt_llm_runtime_memory_metrics{memory_type="gpu",model="tensorrt_llm",version="1"} 1610236 +nv_trt_llm_runtime_memory_metrics{memory_type="cpu",model="tensorrt_llm",version="1"} 0 +# HELP nv_trt_llm_kv_cache_block_metrics TRT LLM KV cache block metrics +# TYPE nv_trt_llm_kv_cache_block_metrics gauge +nv_trt_llm_kv_cache_block_metrics{kv_cache_block_type="fraction",model="tensorrt_llm",version="1"} 0.4875 +nv_trt_llm_kv_cache_block_metrics{kv_cache_block_type="tokens_per",model="tensorrt_llm",version="1"} 64 +nv_trt_llm_kv_cache_block_metrics{kv_cache_block_type="used",model="tensorrt_llm",version="1"} 1 +nv_trt_llm_kv_cache_block_metrics{kv_cache_block_type="free",model="tensorrt_llm",version="1"} 6239 +nv_trt_llm_kv_cache_block_metrics{kv_cache_block_type="max",model="tensorrt_llm",version="1"} 6239 +# HELP nv_trt_llm_inflight_batcher_metrics TRT LLM inflight_batcher-specific metrics +# TYPE nv_trt_llm_inflight_batcher_metrics gauge +nv_trt_llm_inflight_batcher_metrics{inflight_batcher_specific_metric="micro_batch_id",model="tensorrt_llm",version="1"} 0 +nv_trt_llm_inflight_batcher_metrics{inflight_batcher_specific_metric="generation_requests",model="tensorrt_llm",version="1"} 0 +nv_trt_llm_inflight_batcher_metrics{inflight_batcher_specific_metric="total_context_tokens",model="tensorrt_llm",version="1"} 0 +# HELP nv_trt_llm_general_metrics General TRT LLM metrics +# TYPE nv_trt_llm_general_metrics gauge +nv_trt_llm_general_metrics{general_type="iteration_counter",model="tensorrt_llm",version="1"} 0 +nv_trt_llm_general_metrics{general_type="timestamp",model="tensorrt_llm",version="1"} 1700074049 +# HELP nv_trt_llm_disaggregated_serving_metrics TRT LLM disaggregated serving metrics +# TYPE nv_trt_llm_disaggregated_serving_metrics counter +nv_trt_llm_disaggregated_serving_metrics{disaggregated_serving_type="kv_cache_transfer_ms",model="tensorrt_llm",version="1"} 0 +nv_trt_llm_disaggregated_serving_metrics{disaggregated_serving_type="request_count",model="tensorrt_llm",version="1"} 0 ``` -## C++ backend examples (support inflight batching) -Please follow the guide in [`inflight_batcher_llm/README.md`](inflight_batcher_llm/README.md). 
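+
+As a quick sanity check that the backend-specific metrics are being exported,
+you can filter the endpoint output for that prefix (a plain shell one-liner,
+not part of the upstream tooling):
+
+```bash
+curl -s localhost:8002/metrics | grep nv_trt_llm_
+```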
+If, instead, you launched a V1 model, your output will look similar to the +output above except the inflight batcher related fields will be replaced +with something similar to the following: -## Python backend examples (not support inflight batching) +```bash +# HELP nv_trt_llm_v1_metrics TRT LLM v1-specific metrics +# TYPE nv_trt_llm_v1_metrics gauge +nv_trt_llm_v1_metrics{model="tensorrt_llm",v1_specific_metric="total_generation_tokens",version="1"} 20 +nv_trt_llm_v1_metrics{model="tensorrt_llm",v1_specific_metric="empty_generation_slots",version="1"} 0 +nv_trt_llm_v1_metrics{model="tensorrt_llm",v1_specific_metric="total_context_tokens",version="1"} 5 +``` + +Please note that versions of Triton prior to the 23.12 release do not +support base Triton metrics. As such, the following fields will report 0: -### GPT ```bash -cd tools/gpt/ +# HELP nv_inference_request_success Number of successful inference requests, all batch sizes +# TYPE nv_inference_request_success counter +nv_inference_request_success{model="tensorrt_llm",version="1"} 0 +# HELP nv_inference_request_failure Number of failed inference requests, all batch sizes +# TYPE nv_inference_request_failure counter +nv_inference_request_failure{model="tensorrt_llm",version="1"} 0 +# HELP nv_inference_count Number of inferences performed (does not include cached requests) +# TYPE nv_inference_count counter +nv_inference_count{model="tensorrt_llm",version="1"} 0 +# HELP nv_inference_exec_count Number of model executions performed (does not include cached requests) +# TYPE nv_inference_exec_count counter +nv_inference_exec_count{model="tensorrt_llm",version="1"} 0 +# HELP nv_inference_request_duration_us Cumulative inference request duration in microseconds (includes cached requests) +# TYPE nv_inference_request_duration_us counter +nv_inference_request_duration_us{model="tensorrt_llm",version="1"} 0 +# HELP nv_inference_queue_duration_us Cumulative inference queuing duration in microseconds (includes cached requests) +# TYPE nv_inference_queue_duration_us counter +nv_inference_queue_duration_us{model="tensorrt_llm",version="1"} 0 +# HELP nv_inference_compute_input_duration_us Cumulative compute input duration in microseconds (does not include cached requests) +# TYPE nv_inference_compute_input_duration_us counter +nv_inference_compute_input_duration_us{model="tensorrt_llm",version="1"} 0 +# HELP nv_inference_compute_infer_duration_us Cumulative compute inference duration in microseconds (does not include cached requests) +# TYPE nv_inference_compute_infer_duration_us counter +nv_inference_compute_infer_duration_us{model="tensorrt_llm",version="1"} 0 +# HELP nv_inference_compute_output_duration_us Cumulative inference compute output duration in microseconds (does not include cached requests) +# TYPE nv_inference_compute_output_duration_us counter +nv_inference_compute_output_duration_us{model="tensorrt_llm",version="1"} 0 +# HELP nv_inference_pending_request_count Instantaneous number of pending requests awaiting execution per-model. +# TYPE nv_inference_pending_request_count gauge +nv_inference_pending_request_count{model="tensorrt_llm",version="1"} 0 +``` + +## Benchmarking + +Check out [GenAI-Perf](https://github.com/triton-inference-server/perf_analyzer/tree/main/genai-perf) +tool for benchmarking TensorRT-LLM models. 
-rm -rf gpt2 && git clone https://huggingface.co/gpt2 -pushd gpt2 && rm pytorch_model.bin model.safetensors && \ - wget -q https://huggingface.co/gpt2/resolve/main/pytorch_model.bin && popd +You can also use the +[benchmark_core_model script](./tools/inflight_batcher_llm/benchmark_core_model.py) +to benchmark the core model `tensosrrt_llm`. The script sends requests directly +to deployed `tensorrt_llm` model. The benchmark core model latency indicates the +inference latency of TensorRT-LLM, not including the pre/post-processing latency +which is usually handled by a third-party library such as HuggingFace. -python3 client.py \ - --text="Born in north-east France, Soyer trained as a" \ - --output_len=10 \ - --tokenizer_dir gpt2 \ - --tokenizer_type auto +benchmark_core_model can generate traffic from 2 sources. +1 - dataset (json file containing prompts and optional responses) +2 - token normal distribution (user specified input, output seqlen) -# Exmaple output: -# [INFO] Latency: 92.278 ms -# Input: Born in north-east France, Soyer trained as a -# Output: chef and a cook at the local restaurant, La +By default, exponential distrution is used to control arrival rate of requests. +It can be changed to constant arrival time. + +```bash +cd tools/inflight_batcher_llm ``` -*Please note that the example outputs are only for reference, specific performance numbers depend on the GPU you're using.* -## Test +Example: Run dataset with 10 req/sec requested rate with provided tokenizer. ```bash -cd tools/gpt/ +python3 benchmark_core_model.py -i grpc --request_rate 10 dataset --dataset --tokenizer_dir <> --num_requests 5000 +``` -# Identity test -python3 identity_test.py \ - --batch_size=8 --start_len=128 --output_len=20 -# Results: -# [INFO] Batch size: 8, Start len: 8, Output len: 10 -# [INFO] Latency: 70.782 ms -# [INFO] Throughput: 113.023 sentences / sec +Example: Generate I/O seqlen tokens with input normal distribution with mean_seqlen=128, stdev=10. Output normal distribution with mean_seqlen=20, stdev=2. Set stdev=0 to get constant seqlens. -# Benchmark using Perf Analyzer -python3 gen_input_data.py -perf_analyzer -m tensorrt_llm \ - -b 8 --input-data input_data.json \ - --concurrency-range 1:10:2 \ - -u 'localhost:8000' +```bash +python3 benchmark_core_model.py -i grpc --request_rate 10 token_norm_dist --input_mean 128 --input_stdev 5 --output_mean 20 --output_stdev 2 --num_requests 5000 +``` + +Expected outputs + +```bash +[INFO] Warm up for benchmarking. +[INFO] Start benchmarking on 5000 prompts. +[INFO] Total Latency: 26585.349 ms +[INFO] Total request latencies: 11569672.000999955 ms ++----------------------------+----------+ +| Stat | Value | ++----------------------------+----------+ +| Requests/Sec | 188.09 | +| OP tokens/sec | 3857.66 | +| Avg. latency (ms) | 2313.93 | +| P99 latency (ms) | 3624.95 | +| P90 latency (ms) | 3127.75 | +| Avg. IP tokens per request | 128.53 | +| Avg. 
OP tokens per request | 20.51 | +| Total latency (ms) | 26582.72 | +| Total requests | 5000.00 | ++----------------------------+----------+ -# Results: -# Concurrency: 1, throughput: 99.9875 infer/sec, latency 79797 usec -# Concurrency: 3, throughput: 197.308 infer/sec, latency 121342 usec -# Concurrency: 5, throughput: 259.077 infer/sec, latency 153693 usec -# Concurrency: 7, throughput: 286.18 infer/sec, latency 195011 usec -# Concurrency: 9, throughput: 307.067 infer/sec, latency 233354 usec ``` -*Please note that the example outputs are only for reference, specific performance numbers depend on the GPU you're using.* +*Please note that the expected outputs in that document are only for reference, specific performance numbers depend on the GPU you're using.* + +## Testing the TensorRT-LLM Backend +Please follow the guide in [`ci/README.md`](ci/README.md) to see how to run +the testing for TensorRT-LLM backend. diff --git a/all_models/gpt/ensemble/1/.tmp b/all_models/gpt/ensemble/1/.tmp deleted file mode 100644 index e69de29b..00000000 diff --git a/all_models/gpt/ensemble/config.pbtxt b/all_models/gpt/ensemble/config.pbtxt deleted file mode 100755 index fe85954e..00000000 --- a/all_models/gpt/ensemble/config.pbtxt +++ /dev/null @@ -1,220 +0,0 @@ -name: "ensemble" -platform: "ensemble" -max_batch_size: 1024 -input [ - { - name: "text_input" - data_type: TYPE_STRING - dims: [ -1 ] - }, - { - name: "max_tokens" - data_type: TYPE_UINT32 - dims: [ -1 ] - }, - { - name: "bad_words" - data_type: TYPE_STRING - dims: [ -1 ] - }, - { - name: "stop_words" - data_type: TYPE_STRING - dims: [ -1 ] - }, - { - name: "end_id" - data_type: TYPE_UINT32 - dims: [ 1 ] - optional: true - }, - { - name: "pad_id" - data_type: TYPE_UINT32 - dims: [ 1 ] - optional: true - }, - { - name: "top_k" - data_type: TYPE_UINT32 - dims: [ 1 ] - optional: true - }, - { - name: "top_p" - data_type: TYPE_FP32 - dims: [ 1 ] - optional: true - }, - { - name: "temperature" - data_type: TYPE_FP32 - dims: [ 1 ] - optional: true - }, - { - name: "length_penalty" - data_type: TYPE_FP32 - dims: [ 1 ] - optional: true - }, - { - name: "repetition_penalty" - data_type: TYPE_FP32 - dims: [ 1 ] - optional: true - }, - { - name: "min_length" - data_type: TYPE_UINT32 - dims: [ 1 ] - optional: true - }, - { - name: "presence_penalty" - data_type: TYPE_FP32 - dims: [ 1 ] - optional: true - }, - { - name: "random_seed" - data_type: TYPE_UINT64 - dims: [ 1 ] - optional: true - }, - { - name: "beam_width" - data_type: TYPE_UINT32 - dims: [ 1 ] - optional: true - }, - { - name: "output_log_probs" - data_type: TYPE_BOOL - dims: [ 1 ] - optional: true - } -] -output [ - { - name: "text_output" - data_type: TYPE_STRING - dims: [ -1, -1 ] - } -] -ensemble_scheduling { - step [ - { - model_name: "preprocessing" - model_version: -1 - input_map { - key: "QUERY" - value: "text_input" - } - input_map { - key: "REQUEST_OUTPUT_LEN" - value: "max_tokens" - } - input_map { - key: "BAD_WORDS_DICT" - value: "bad_words" - } - input_map { - key: "STOP_WORDS_DICT" - value: "stop_words" - } - output_map { - key: "REQUEST_INPUT_LEN" - value: "_REQUEST_INPUT_LEN" - } - output_map { - key: "INPUT_ID" - value: "_INPUT_ID" - } - output_map { - key: "REQUEST_OUTPUT_LEN" - value: "_REQUEST_OUTPUT_LEN" - } - }, - { - model_name: "tensorrt_llm" - model_version: -1 - input_map { - key: "input_ids" - value: "_INPUT_ID" - } - input_map { - key: "input_lengths" - value: "_REQUEST_INPUT_LEN" - } - input_map { - key: "request_output_len" - value: "_REQUEST_OUTPUT_LEN" - } - 
input_map { - key: "end_id" - value: "end_id" - } - input_map { - key: "pad_id" - value: "pad_id" - } - input_map { - key: "runtime_top_k" - value: "top_k" - } - input_map { - key: "runtime_top_p" - value: "top_p" - } - input_map { - key: "temperature" - value: "temperature" - } - input_map { - key: "len_penalty" - value: "length_penalty" - } - input_map { - key: "repetition_penalty" - value: "repetition_penalty" - } - input_map { - key: "min_length" - value: "min_length" - } - input_map { - key: "presence_penalty" - value: "presence_penalty" - } - input_map { - key: "random_seed" - value: "random_seed" - } - input_map { - key: "beam_width" - value: "beam_width" - } - input_map { - key: "output_log_probs" - value: "output_log_probs" - } - output_map { - key: "output_ids" - value: "_TOKENS_BATCH" - } - }, - { - model_name: "postprocessing" - model_version: -1 - input_map { - key: "TOKENS_BATCH" - value: "_TOKENS_BATCH" - } - output_map { - key: "OUTPUT" - value: "text_output" - } - } - ] -} diff --git a/all_models/gpt/postprocessing/1/model.py b/all_models/gpt/postprocessing/1/model.py deleted file mode 100644 index 032c12ae..00000000 --- a/all_models/gpt/postprocessing/1/model.py +++ /dev/null @@ -1,128 +0,0 @@ -# -*- coding: utf-8 -*- -import json - -import numpy as np -import triton_python_backend_utils as pb_utils -from transformers import AutoTokenizer, LlamaTokenizer, T5Tokenizer - - -class TritonPythonModel: - """Your Python model must use the same class name. Every Python model - that is created must have "TritonPythonModel" as the class name. - """ - - def initialize(self, args): - """`initialize` is called only once when the model is being loaded. - Implementing `initialize` function is optional. This function allows - the model to initialize any state associated with this model. - Parameters - ---------- - args : dict - Both keys and values are strings. The dictionary keys and values are: - * model_config: A JSON string containing the model configuration - * model_instance_kind: A string containing model instance kind - * model_instance_device_id: A string containing model instance device ID - * model_repository: Model repository path - * model_version: Model version - * model_name: Model name - """ - # Parse model configs - model_config = json.loads(args['model_config']) - tokenizer_dir = model_config['parameters']['tokenizer_dir'][ - 'string_value'] - tokenizer_type = model_config['parameters']['tokenizer_type'][ - 'string_value'] - - if tokenizer_type == 't5': - self.tokenizer = T5Tokenizer(vocab_file=tokenizer_dir, - padding_side='left') - elif tokenizer_type == 'auto': - self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_dir, - padding_side='left') - elif tokenizer_type == 'llama': - self.tokenizer = LlamaTokenizer.from_pretrained( - tokenizer_dir, legacy=False, padding_side='left') - else: - raise AttributeError( - f'Unexpected tokenizer type: {tokenizer_type}') - self.tokenizer.pad_token = self.tokenizer.eos_token - - # Parse model output configs - output_config = pb_utils.get_output_config_by_name( - model_config, "OUTPUT") - - # Convert Triton types to numpy types - self.output_dtype = pb_utils.triton_string_to_numpy( - output_config['data_type']) - - def execute(self, requests): - """`execute` must be implemented in every Python model. `execute` - function receives a list of pb_utils.InferenceRequest as the only - argument. This function is called when an inference is requested - for this model. Depending on the batching configuration (e.g. 
Dynamic - Batching) used, `requests` may contain multiple requests. Every - Python model, must create one pb_utils.InferenceResponse for every - pb_utils.InferenceRequest in `requests`. If there is an error, you can - set the error argument when creating a pb_utils.InferenceResponse. - Parameters - ---------- - requests : list - A list of pb_utils.InferenceRequest - Returns - ------- - list - A list of pb_utils.InferenceResponse. The length of this list must - be the same as `requests` - """ - - responses = [] - - # Every Python backend must iterate over everyone of the requests - # and create a pb_utils.InferenceResponse for each of them. - for idx, request in enumerate(requests): - # Get input tensors - tokens_batch = pb_utils.get_input_tensor_by_name( - request, 'TOKENS_BATCH').as_numpy() - - # Reshape Input - # tokens_batch = tokens_batch.reshape([-1, tokens_batch.shape[0]]) - # tokens_batch = tokens_batch.T - - # Postprocessing output data. - outputs = self._postprocessing(tokens_batch) - - # Create output tensors. You need pb_utils.Tensor - # objects to create pb_utils.InferenceResponse. - output_tensor = pb_utils.Tensor( - 'OUTPUT', - np.array(outputs).astype(self.output_dtype)) - - # Create InferenceResponse. You can set an error here in case - # there was a problem with handling this inference request. - # Below is an example of how you can set errors in inference - # response: - # - # pb_utils.InferenceResponse( - # output_tensors=..., TritonError("An error occurred")) - inference_response = pb_utils.InferenceResponse( - output_tensors=[output_tensor]) - responses.append(inference_response) - - # You should return a list of pb_utils.InferenceResponse. Length - # of this list must match the length of `requests` list. - return responses - - def finalize(self): - """`finalize` is called only once when the model is being unloaded. - Implementing `finalize` function is optional. This function allows - the model to perform any necessary clean ups before exit. - """ - print('Cleaning up...') - - def _postprocessing(self, tokens_batch): - outputs = [] - for beam_tokens in tokens_batch: - for tokens in beam_tokens: - output = self.tokenizer.decode(tokens) - outputs.append(output.encode('utf8')) - return outputs diff --git a/all_models/gpt/postprocessing/config.pbtxt b/all_models/gpt/postprocessing/config.pbtxt deleted file mode 100755 index 64908b5f..00000000 --- a/all_models/gpt/postprocessing/config.pbtxt +++ /dev/null @@ -1,38 +0,0 @@ -name: "postprocessing" -backend: "python" -max_batch_size: 1024 -input [ - { - name: "TOKENS_BATCH" - data_type: TYPE_INT32 - dims: [ -1, -1 ] - } -] -output [ - { - name: "OUTPUT" - data_type: TYPE_STRING - dims: [ -1, -1 ] - } -] - -parameters { - key: "tokenizer_dir" - value: { - string_value: "${tokenizer_dir}" - } -} - -parameters { - key: "tokenizer_type" - value: { - string_value: "${tokenizer_type}" - } -} - -instance_group [ - { - count: 1 - kind: KIND_CPU - } -] diff --git a/all_models/gpt/preprocessing/1/model.py b/all_models/gpt/preprocessing/1/model.py deleted file mode 100644 index ad185c14..00000000 --- a/all_models/gpt/preprocessing/1/model.py +++ /dev/null @@ -1,210 +0,0 @@ -# -*- coding: utf-8 -*- -import csv -import json -from typing import List - -import numpy as np -import torch -import triton_python_backend_utils as pb_utils -from torch.nn.utils.rnn import pad_sequence -from transformers import AutoTokenizer, LlamaTokenizer, T5Tokenizer - - -class TritonPythonModel: - """Your Python model must use the same class name. 
Every Python model - that is created must have "TritonPythonModel" as the class name. - """ - - def initialize(self, args): - """`initialize` is called only once when the model is being loaded. - Implementing `initialize` function is optional. This function allows - the model to initialize any state associated with this model. - Parameters - ---------- - args : dict - Both keys and values are strings. The dictionary keys and values are: - * model_config: A JSON string containing the model configuration - * model_instance_kind: A string containing model instance kind - * model_instance_device_id: A string containing model instance device ID - * model_repository: Model repository path - * model_version: Model version - * model_name: Model name - """ - # Parse model configs - model_config = json.loads(args['model_config']) - tokenizer_dir = model_config['parameters']['tokenizer_dir'][ - 'string_value'] - tokenizer_type = model_config['parameters']['tokenizer_type'][ - 'string_value'] - - if tokenizer_type == 't5': - self.tokenizer = T5Tokenizer(vocab_file=tokenizer_dir, - padding_side='left') - elif tokenizer_type == 'auto': - self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_dir, - padding_side='left') - elif tokenizer_type == 'llama': - self.tokenizer = LlamaTokenizer.from_pretrained( - tokenizer_dir, legacy=False, padding_side='left') - else: - raise AttributeError( - f'Unexpected tokenizer type: {tokenizer_type}') - self.tokenizer.pad_token = self.tokenizer.eos_token - - self.pad_id = self.tokenizer.encode(self.tokenizer.pad_token, - add_special_tokens=False)[0] - - # Parse model output configs and convert Triton types to numpy types - input_names = [ - "INPUT_ID", "REQUEST_INPUT_LEN", "BAD_WORDS_IDS", "STOP_WORDS_IDS" - ] - for input_name in input_names: - setattr( - self, - input_name.lower() + "_dtype", - pb_utils.triton_string_to_numpy( - pb_utils.get_output_config_by_name( - model_config, input_name)['data_type'])) - - def execute(self, requests): - """`execute` must be implemented in every Python model. `execute` - function receives a list of pb_utils.InferenceRequest as the only - argument. This function is called when an inference is requested - for this model. Depending on the batching configuration (e.g. Dynamic - Batching) used, `requests` may contain multiple requests. Every - Python model, must create one pb_utils.InferenceResponse for every - pb_utils.InferenceRequest in `requests`. If there is an error, you can - set the error argument when creating a pb_utils.InferenceResponse. - Parameters - ---------- - requests : list - A list of pb_utils.InferenceRequest - Returns - ------- - list - A list of pb_utils.InferenceResponse. The length of this list must - be the same as `requests` - """ - - responses = [] - - # Every Python backend must iterate over everyone of the requests - # and create a pb_utils.InferenceResponse for each of them. - for idx, request in enumerate(requests): - # Get input tensors - query = pb_utils.get_input_tensor_by_name(request, - 'QUERY').as_numpy() - request_output_len = pb_utils.get_input_tensor_by_name( - request, 'REQUEST_OUTPUT_LEN').as_numpy() - - bad_words_dict = pb_utils.get_input_tensor_by_name( - request, 'BAD_WORDS_DICT').as_numpy() - stop_words_dict = pb_utils.get_input_tensor_by_name( - request, 'STOP_WORDS_DICT').as_numpy() - - # Preprocessing input data. 
- input_id, request_input_len = self._create_request(query) - bad_words = self._to_word_list_format(bad_words_dict) - stop_words = self._to_word_list_format(stop_words_dict) - - # Create output tensors. You need pb_utils.Tensor - # objects to create pb_utils.InferenceResponse. - input_id_tensor = pb_utils.Tensor( - 'INPUT_ID', - np.array(input_id).astype(self.input_id_dtype)) - request_input_len_tensor = pb_utils.Tensor( - 'REQUEST_INPUT_LEN', - np.array(request_input_len).astype( - self.request_input_len_dtype)) - request_output_len_tensor = pb_utils.Tensor( - 'REQUEST_OUTPUT_LEN', request_output_len) - bad_words_ids_tensor = pb_utils.Tensor('BAD_WORDS_IDS', bad_words) - stop_words_ids_tensor = pb_utils.Tensor('STOP_WORDS_IDS', - stop_words) - - # Create InferenceResponse. You can set an error here in case - # there was a problem with handling this inference request. - # Below is an example of how you can set errors in inference - # response: - # - # pb_utils.InferenceResponse( - # output_tensors=..., TritonError("An error occurred")) - inference_response = pb_utils.InferenceResponse(output_tensors=[ - input_id_tensor, bad_words_ids_tensor, stop_words_ids_tensor, - request_input_len_tensor, request_output_len_tensor - ]) - responses.append(inference_response) - - # You should return a list of pb_utils.InferenceResponse. Length - # of this list must match the length of `requests` list. - return responses - - def finalize(self): - """`finalize` is called only once when the model is being unloaded. - Implementing `finalize` function is optional. This function allows - the model to perform any necessary clean ups before exit. - """ - print('Cleaning up...') - - def _create_request(self, query): - """ - query : batch string (2D numpy array) - """ - start_ids = [ - torch.IntTensor(self.tokenizer.encode(s[0].decode())) - for s in query - ] - start_lengths = torch.IntTensor([[len(ids)] for ids in start_ids]) - - start_ids = pad_sequence(start_ids, - batch_first=True, - padding_value=self.pad_id) - # input_len = min(start_lengths) - #attn_mask = torch.ones((batch_size, input_len, input_len)).tril() - - return start_ids, start_lengths - - def _to_word_list_format(self, word_dict: List[List[str]]): - ''' - format of word_dict - len(word_dict) should be same to batch_size - word_dict[i] means the words for batch i - len(word_dict[i]) must be 1, which means it only contains 1 string - This string can contains several sentences and split by ",". - For example, if word_dict[2] = " I am happy, I am sad", then this function will return - the ids for two short sentences " I am happy" and " I am sad". 
- ''' - assert self.tokenizer != None, "need to set tokenizer" - - flat_ids = [] - offsets = [] - for word_dict_item in word_dict: - item_flat_ids = [] - item_offsets = [] - - if isinstance(word_dict_item[0], bytes): - word_dict_item = [word_dict_item[0].decode()] - - words = list(csv.reader(word_dict_item))[0] - for word in words: - ids = self.tokenizer.encode(word) - - if len(ids) == 0: - continue - - item_flat_ids += ids - item_offsets.append(len(ids)) - - flat_ids.append(np.array(item_flat_ids)) - offsets.append(np.cumsum(np.array(item_offsets))) - - pad_to = max(1, max(len(ids) for ids in flat_ids)) - - for i, (ids, offs) in enumerate(zip(flat_ids, offsets)): - flat_ids[i] = np.pad(ids, (0, pad_to - len(ids)), - constant_values=0) - offsets[i] = np.pad(offs, (0, pad_to - len(offs)), - constant_values=-1) - - return np.array([flat_ids, offsets], dtype="int32").transpose( - (1, 0, 2)) diff --git a/all_models/gpt/preprocessing/config.pbtxt b/all_models/gpt/preprocessing/config.pbtxt deleted file mode 100644 index 99f14c31..00000000 --- a/all_models/gpt/preprocessing/config.pbtxt +++ /dev/null @@ -1,78 +0,0 @@ -name: "preprocessing" -backend: "python" -max_batch_size: 1024 -input [ - { - name: "QUERY" - data_type: TYPE_STRING - dims: [ -1 ] - }, - { - name: "BAD_WORDS_DICT" - data_type: TYPE_STRING - dims: [ -1 ] - }, - { - name: "STOP_WORDS_DICT" - data_type: TYPE_STRING - dims: [ -1 ] - }, - { - name: "REQUEST_OUTPUT_LEN" - data_type: TYPE_UINT32 - dims: [ -1 ] - } -] -output [ - { - name: "INPUT_ID" - data_type: TYPE_INT32 - dims: [ -1 ] - }, - { - name: "REQUEST_INPUT_LEN" - data_type: TYPE_INT32 - dims: [ 1 ] - }, - { - name: "BAD_WORDS_IDS" - data_type: TYPE_INT32 - dims: [ 2, -1 ] - }, - { - name: "STOP_WORDS_IDS" - data_type: TYPE_INT32 - dims: [ 2, -1 ] - }, - { - name: "REQUEST_OUTPUT_LEN" - data_type: TYPE_UINT32 - dims: [ -1 ] - }, - { - name: "PROMPT_LEARNING_TASK_NAME_IDS" - data_type: TYPE_UINT32 - dims: [ 1 ] - } -] - -parameters { - key: "tokenizer_dir" - value: { - string_value: "${tokenizer_dir}" - } -} - -parameters { - key: "tokenizer_type" - value: { - string_value: "${tokenizer_type}" - } -} - -instance_group [ - { - count: 1 - kind: KIND_CPU - } -] diff --git a/all_models/gpt/tensorrt_llm/1/model.py b/all_models/gpt/tensorrt_llm/1/model.py deleted file mode 100644 index 3d036efd..00000000 --- a/all_models/gpt/tensorrt_llm/1/model.py +++ /dev/null @@ -1,261 +0,0 @@ -import json -import os - -import torch -import triton_python_backend_utils as pb_utils -from torch import from_numpy - -import tensorrt_llm -from tensorrt_llm.runtime import GenerationSession, ModelConfig, SamplingConfig - - -def mpi_comm(): - from mpi4py import MPI - return MPI.COMM_WORLD - - -def mpi_rank(): - return mpi_comm().Get_rank() - - -def get_engine_name(model, dtype, tp_size, rank): - return '{}_{}_tp{}_rank{}.engine'.format(model, dtype, tp_size, rank) - - -def get_input_tensor_by_name(request, name): - tensor = pb_utils.get_input_tensor_by_name(request, name) - if tensor is not None: - # Triton tensor -> numpy tensor -> PyTorch tensor - return from_numpy(tensor.as_numpy()) - else: - return tensor - - -def get_input_scalar_by_name(request, name): - tensor = pb_utils.get_input_tensor_by_name(request, name) - if tensor is not None: - # Triton tensor -> numpy tensor -> first scalar - tensor = tensor.as_numpy() - return tensor.reshape((tensor.size, ))[0] - else: - return tensor - - -class TritonPythonModel: - """Your Python model must use the same class name. 
Every Python model - that is created must have "TritonPythonModel" as the class name. - """ - - def initialize(self, args): - """`initialize` is called only once when the model is being loaded. - Implementing `initialize` function is optional. This function allows - the model to initialize any state associated with this model. - - Parameters - ---------- - args : dict - Both keys and values are strings. The dictionary keys and values are: - * model_config: A JSON string containing the model configuration - * model_instance_kind: A string containing model instance kind - * model_instance_device_id: A string containing model instance device ID - * model_repository: Model repository path - * model_version: Model version - * model_name: Model name - """ - model_config = json.loads(args['model_config']) - engine_dir = model_config['parameters']['engine_dir']['string_value'] - config_path = os.path.join(engine_dir, 'config.json') - with open(config_path, 'r') as f: - config = json.load(f) - use_gpt_attention_plugin = config['plugin_config'][ - 'gpt_attention_plugin'] - self.remove_input_padding = config['plugin_config'][ - 'remove_input_padding'] - model = config['builder_config']['name'] - dtype = config['builder_config']['precision'] - tensor_parallel = config['builder_config']['tensor_parallel'] - pipeline_parallel = 1 - if 'pipeline_parallel' in config['builder_config']: - pipeline_parallel = config['builder_config']['pipeline_parallel'] - world_size = tensor_parallel * pipeline_parallel - assert world_size == tensorrt_llm.mpi_world_size(), \ - f'Engine world size ({world_size}) != Runtime world size ({tensorrt_llm.mpi_world_size()})' - num_heads = config['builder_config']['num_heads'] // world_size - hidden_size = config['builder_config']['hidden_size'] // world_size - vocab_size = config['builder_config']['vocab_size'] - num_layers = config['builder_config']['num_layers'] - num_kv_heads = num_heads - if "num_kv_heads" in config['builder_config'].keys(): - num_kv_heads = (config['builder_config']['num_kv_heads'] + - world_size - 1) // world_size - elif "multi_query_mode" in config['builder_config'].keys(): - num_kv_heads = 1 if config['builder_config'][ - 'multi_query_mode'] else num_heads - - self.comm = mpi_comm() - self.rank = mpi_rank() - - model_config = ModelConfig( - num_heads=num_heads, - num_kv_heads=num_kv_heads, - hidden_size=hidden_size, - vocab_size=vocab_size, - num_layers=num_layers, - gpt_attention_plugin=use_gpt_attention_plugin, - remove_input_padding=self.remove_input_padding) - engine_name = get_engine_name(model, dtype, world_size, self.rank) - serialize_path = os.path.join(engine_dir, engine_name) - with open(serialize_path, 'rb') as f: - engine_buffer = f.read() - runtime_mapping = tensorrt_llm.Mapping(world_size=world_size, - rank=self.rank, - gpus_per_node=8, - tp_size=tensor_parallel, - pp_size=pipeline_parallel) - torch.cuda.set_device(self.rank % runtime_mapping.gpus_per_node) - self.decoder = GenerationSession(model_config, engine_buffer, - runtime_mapping) - - if self.rank != 0: - while (True): - self.execute([None]) - - def execute(self, requests): - """`execute` must be implemented in every Python model. `execute` - function receives a list of pb_utils.InferenceRequest as the only - argument. This function is called when an inference is requested - for this model. - - Parameters - ---------- - requests : list - A list of pb_utils.InferenceRequest - - Returns - ------- - list - A list of pb_utils.InferenceResponse. 
The length of this list must - be the same as `requests` - """ - responses = [] - - # Every Python backend must iterate through list of requests and create - # an instance of pb_utils.InferenceResponse class for each of them. You - # should avoid storing any of the input Tensors in the class attributes - # as they will be overridden in subsequent inference requests. You can - # make a copy of the underlying NumPy array and store it if it is - # required. - for request in requests: - # Perform inference on the request and append it to responses list... - inputs = {} - if self.rank == 0: - inputs['input_ids'] = get_input_tensor_by_name( - request, 'input_ids') - inputs['input_lengths'] = get_input_tensor_by_name( - request, 'input_lengths') - inputs['request_output_len'] = get_input_scalar_by_name( - request, 'request_output_len') - inputs['end_id'] = get_input_scalar_by_name(request, 'end_id') - inputs['pad_id'] = get_input_scalar_by_name(request, 'pad_id') - inputs['beam_width'] = get_input_scalar_by_name( - request, 'beam_width') - inputs['temperature'] = get_input_scalar_by_name( - request, 'temperature') - inputs['runtime_top_k'] = get_input_scalar_by_name( - request, 'runtime_top_k') - inputs['runtime_top_p'] = get_input_scalar_by_name( - request, 'runtime_top_p') - inputs['len_penalty'] = get_input_scalar_by_name( - request, 'len_penalty') - inputs['repetition_penalty'] = get_input_scalar_by_name( - request, 'repetition_penalty') - inputs['min_length'] = get_input_scalar_by_name( - request, 'min_length') - inputs['presence_penalty'] = get_input_scalar_by_name( - request, 'presence_penalty') - inputs['random_seed'] = get_input_scalar_by_name( - request, 'random_seed') - inputs['output_log_probs'] = get_input_scalar_by_name( - request, 'output_log_probs') - - # Broadcast requests to other clients - inputs = self.comm.bcast(inputs, root=0) - input_ids = inputs['input_ids'].cuda() - input_lengths = inputs['input_lengths'].cuda() - end_id = inputs['end_id'] - pad_id = inputs['pad_id'] - - sampling_config = SamplingConfig(end_id=end_id, pad_id=pad_id) - if inputs['beam_width'] is not None: - sampling_config.num_beams = inputs['beam_width'] - if inputs['temperature'] is not None: - sampling_config.temperature = inputs['temperature'] - if inputs['runtime_top_k'] is not None: - sampling_config.top_k = inputs['runtime_top_k'] - if inputs['runtime_top_p'] is not None: - sampling_config.top_p = inputs['runtime_top_p'] - if inputs['len_penalty'] is not None: - sampling_config.length_penalty = inputs['len_penalty'] - if inputs['repetition_penalty'] is not None: - sampling_config.repetition_penalty = inputs[ - 'repetition_penalty'] - if inputs['min_length'] is not None: - sampling_config.min_length = inputs['min_length'] - if inputs['presence_penalty'] is not None: - sampling_config.presence_penalty = inputs['presence_penalty'] - sampling_config.random_seed = inputs['random_seed'] - sampling_config.output_log_probs = inputs['output_log_probs'] - if self.remove_input_padding: - self.decoder.setup( - batch_size=input_ids.size(0), - max_context_length=torch.max(input_lengths).item(), - max_new_tokens=inputs['request_output_len']) - else: - self.decoder.setup(input_ids.size(0), input_ids.size(1), - inputs['request_output_len']) - if self.remove_input_padding: - output_ids = self.decoder.decode_batch(input_ids, - sampling_config) - else: - output_ids = self.decoder.decode(input_ids, input_lengths, - sampling_config) - - if self.rank == 0: - # Create output tensors. 
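The `execute` loop above only reads the Triton request on MPI rank 0 and then shares it with the other ranks through `comm.bcast`, so every tensor-parallel worker runs the same decode step. Below is a stripped-down sketch of that broadcast pattern using `mpi4py`; the payload and the launch command are placeholders, not the backend's actual wiring.

```python
# Illustrative sketch of the rank-0 broadcast pattern used above.
# Run with something like `mpirun -n 2 python sketch.py` (assumes mpi4py is installed).
from mpi4py import MPI

comm = MPI.COMM_WORLD
rank = comm.Get_rank()


def serve_one_request():
    inputs = None
    if rank == 0:
        # In the real model this dict is filled from pb_utils input tensors.
        inputs = {"input_ids": [[1, 2, 3]], "request_output_len": 8}
    # Non-zero ranks contribute None and receive rank 0's payload.
    inputs = comm.bcast(inputs, root=0)
    print(f"rank {rank} decodes with {inputs}")


serve_one_request()
```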
You need pb_utils.Tensor - # objects to create pb_utils.InferenceResponse. - torch.cuda.synchronize() - output_tensors = [ - pb_utils.Tensor("output_ids", - output_ids.cpu().numpy()) - ] - - if sampling_config.output_log_probs: - # [max_new_tokens, batch_size, num_beams] -> [batch_size, max_new_tokens, num_beams] - log_probs = self.decoder.log_probs.transpose( - 0, 1).cpu().numpy() - output_tensors.append( - pb_utils.Tensor("log_probs", log_probs)) - - # Create InferenceResponse. You can set an error here in case - # there was a problem with handling this inference request. - # Below is an example of how you can set errors in inference - # response: - # - # pb_utils.InferenceResponse( - # output_tensors=..., TritonError("An error occured")) - - inference_response = pb_utils.InferenceResponse(output_tensors) - else: - inference_response = pb_utils.InferenceResponse([]) - responses.append(inference_response) - - # You must return a list of pb_utils.InferenceResponse. Length - # of this list must match the length of `requests` list. - return responses - - def finalize(self): - """`finalize` is called only once when the model is being unloaded. - Implementing `finalize` function is optional. This function allows - the model to perform any necessary clean ups before exit. - """ - return diff --git a/all_models/gpt/tensorrt_llm/config.pbtxt b/all_models/gpt/tensorrt_llm/config.pbtxt deleted file mode 100644 index 15ab5100..00000000 --- a/all_models/gpt/tensorrt_llm/config.pbtxt +++ /dev/null @@ -1,139 +0,0 @@ -name: "tensorrt_llm" -backend: "python" -max_batch_size: 1024 - -# # Uncomment this for dynamic_batching -# dynamic_batching { -# max_queue_delay_microseconds: 50000 -# } - -input [ - { - name: "input_ids" - data_type: TYPE_INT32 - dims: [ -1 ] - }, - { - name: "input_lengths" - data_type: TYPE_INT32 - dims: [ 1 ] - reshape: { shape: [ ] } - }, - { - name: "request_output_len" - data_type: TYPE_UINT32 - dims: [ -1 ] - }, - { - name: "end_id" - data_type: TYPE_UINT32 - dims: [ 1 ] - reshape: { shape: [ ] } - }, - { - name: "pad_id" - data_type: TYPE_UINT32 - dims: [ 1 ] - reshape: { shape: [ ] } - }, - { - name: "beam_width" - data_type: TYPE_UINT32 - dims: [ 1 ] - reshape: { shape: [ ] } - optional: true - }, - { - name: "temperature" - data_type: TYPE_FP32 - dims: [ 1 ] - reshape: { shape: [ ] } - optional: true - }, - { - name: "runtime_top_k" - data_type: TYPE_UINT32 - dims: [ 1 ] - reshape: { shape: [ ] } - optional: true - }, - { - name: "runtime_top_p" - data_type: TYPE_FP32 - dims: [ 1 ] - reshape: { shape: [ ] } - optional: true - }, - { - name: "len_penalty" - data_type: TYPE_FP32 - dims: [ 1 ] - reshape: { shape: [ ] } - optional: true - }, - { - name: "repetition_penalty" - data_type: TYPE_FP32 - dims: [ 1 ] - reshape: { shape: [ ] } - optional: true - }, - { - name: "min_length" - data_type: TYPE_UINT32 - dims: [ 1 ] - reshape: { shape: [ ] } - optional: true - }, - { - name: "presence_penalty" - data_type: TYPE_FP32 - dims: [ 1 ] - reshape: { shape: [ ] } - optional: true - }, - { - name: "random_seed" - data_type: TYPE_UINT64 - dims: [ 1 ] - reshape: { shape: [ ] } - optional: true - }, - { - name: "output_log_probs" - data_type: TYPE_BOOL - dims: [ 1 ] - reshape: { shape: [ ] } - optional: true - } -] -output [ - { - name: "output_ids" - data_type: TYPE_INT32 - dims: [ -1, -1 ] - }, - { - name: "log_probs" - data_type: TYPE_FP32 - dims: [ -1, -1 ] - } -] -instance_group [ - { - count: 1 - kind : KIND_CPU - } -] -parameters { - key: "engine_dir" - value: { - string_value: 
"${engine_dir}" - } -} -parameters: { - key: "FORCE_CPU_ONLY_INPUT_TENSORS" - value: { - string_value: "no" - } -} diff --git a/all_models/inflight_batcher_llm/ensemble/1/.tmp b/all_models/inflight_batcher_llm/ensemble/1/.tmp deleted file mode 100644 index e69de29b..00000000 diff --git a/all_models/inflight_batcher_llm/ensemble/config.pbtxt b/all_models/inflight_batcher_llm/ensemble/config.pbtxt deleted file mode 100755 index 01131a7e..00000000 --- a/all_models/inflight_batcher_llm/ensemble/config.pbtxt +++ /dev/null @@ -1,246 +0,0 @@ -# Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions -# are met: -# * Redistributions of source code must retain the above copyright -# notice, this list of conditions and the following disclaimer. -# * Redistributions in binary form must reproduce the above copyright -# notice, this list of conditions and the following disclaimer in the -# documentation and/or other materials provided with the distribution. -# * Neither the name of NVIDIA CORPORATION nor the names of its -# contributors may be used to endorse or promote products derived -# from this software without specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY -# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR -# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY -# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- -name: "ensemble" -platform: "ensemble" -max_batch_size: 128 -input [ - { - name: "text_input" - data_type: TYPE_STRING - dims: [ -1 ] - }, - { - name: "max_tokens" - data_type: TYPE_UINT32 - dims: [ -1 ] - }, - { - name: "bad_words" - data_type: TYPE_STRING - dims: [ -1 ] - }, - { - name: "stop_words" - data_type: TYPE_STRING - dims: [ -1 ] - }, - { - name: "end_id" - data_type: TYPE_UINT32 - dims: [ 1 ] - optional: true - }, - { - name: "pad_id" - data_type: TYPE_UINT32 - dims: [ 1 ] - optional: true - }, - { - name: "top_k" - data_type: TYPE_UINT32 - dims: [ 1 ] - optional: true - }, - { - name: "top_p" - data_type: TYPE_FP32 - dims: [ 1 ] - optional: true - }, - { - name: "temperature" - data_type: TYPE_FP32 - dims: [ 1 ] - optional: true - }, - { - name: "length_penalty" - data_type: TYPE_FP32 - dims: [ 1 ] - optional: true - }, - { - name: "repetition_penalty" - data_type: TYPE_FP32 - dims: [ 1 ] - optional: true - }, - { - name: "min_length" - data_type: TYPE_UINT32 - dims: [ 1 ] - optional: true - }, - { - name: "presence_penalty" - data_type: TYPE_FP32 - dims: [ 1 ] - optional: true - }, - { - name: "random_seed" - data_type: TYPE_UINT64 - dims: [ 1 ] - optional: true - }, - { - name: "beam_width" - data_type: TYPE_UINT32 - dims: [ 1 ] - optional: true - }, - { - name: "stream" - data_type: TYPE_BOOL - dims: [ 1 ] - optional: true - } -] -output [ - { - name: "text_output" - data_type: TYPE_STRING - dims: [ -1, -1 ] - } -] -ensemble_scheduling { - step [ - { - model_name: "preprocessing" - model_version: -1 - input_map { - key: "QUERY" - value: "text_input" - } - input_map { - key: "REQUEST_OUTPUT_LEN" - value: "max_tokens" - } - input_map { - key: "BAD_WORDS_DICT" - value: "bad_words" - } - input_map { - key: "STOP_WORDS_DICT" - value: "stop_words" - } - output_map { - key: "REQUEST_INPUT_LEN" - value: "_REQUEST_INPUT_LEN" - } - output_map { - key: "INPUT_ID" - value: "_INPUT_ID" - } - output_map { - key: "REQUEST_OUTPUT_LEN" - value: "_REQUEST_OUTPUT_LEN" - } - }, - { - model_name: "tensorrt_llm" - model_version: -1 - input_map { - key: "input_ids" - value: "_INPUT_ID" - } - input_map { - key: "input_lengths" - value: "_REQUEST_INPUT_LEN" - } - input_map { - key: "request_output_len" - value: "_REQUEST_OUTPUT_LEN" - } - input_map { - key: "end_id" - value: "end_id" - } - input_map { - key: "pad_id" - value: "pad_id" - } - input_map { - key: "runtime_top_k" - value: "top_k" - } - input_map { - key: "runtime_top_p" - value: "top_p" - } - input_map { - key: "temperature" - value: "temperature" - } - input_map { - key: "len_penalty" - value: "length_penalty" - } - input_map { - key: "repetition_penalty" - value: "repetition_penalty" - } - input_map { - key: "min_length" - value: "min_length" - } - input_map { - key: "presence_penalty" - value: "presence_penalty" - } - input_map { - key: "random_seed" - value: "random_seed" - } - input_map { - key: "beam_width" - value: "beam_width" - } - input_map { - key: "streaming" - value: "stream" - } - output_map { - key: "output_ids" - value: "_TOKENS_BATCH" - } - }, - { - model_name: "postprocessing" - model_version: -1 - input_map { - key: "TOKENS_BATCH" - value: "_TOKENS_BATCH" - } - output_map { - key: "OUTPUT" - value: "text_output" - } - } - ] -} diff --git a/all_models/inflight_batcher_llm/postprocessing/1/model.py b/all_models/inflight_batcher_llm/postprocessing/1/model.py deleted file mode 100644 index 183241ff..00000000 --- a/all_models/inflight_batcher_llm/postprocessing/1/model.py +++ /dev/null @@ -1,153 +0,0 @@ -# Copyright 
2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions -# are met: -# * Redistributions of source code must retain the above copyright -# notice, this list of conditions and the following disclaimer. -# * Redistributions in binary form must reproduce the above copyright -# notice, this list of conditions and the following disclaimer in the -# documentation and/or other materials provided with the distribution. -# * Neither the name of NVIDIA CORPORATION nor the names of its -# contributors may be used to endorse or promote products derived -# from this software without specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY -# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR -# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY -# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -import json - -import numpy as np -import triton_python_backend_utils as pb_utils -from transformers import AutoTokenizer, LlamaTokenizer, T5Tokenizer - - -class TritonPythonModel: - """Your Python model must use the same class name. Every Python model - that is created must have "TritonPythonModel" as the class name. - """ - - def initialize(self, args): - """`initialize` is called only once when the model is being loaded. - Implementing `initialize` function is optional. This function allows - the model to initialize any state associated with this model. - Parameters - ---------- - args : dict - Both keys and values are strings. 
The dictionary keys and values are: - * model_config: A JSON string containing the model configuration - * model_instance_kind: A string containing model instance kind - * model_instance_device_id: A string containing model instance device ID - * model_repository: Model repository path - * model_version: Model version - * model_name: Model name - """ - # Parse model configs - model_config = json.loads(args['model_config']) - tokenizer_dir = model_config['parameters']['tokenizer_dir'][ - 'string_value'] - tokenizer_type = model_config['parameters']['tokenizer_type'][ - 'string_value'] - - if tokenizer_type == 't5': - self.tokenizer = T5Tokenizer(vocab_file=tokenizer_dir, - padding_side='left') - elif tokenizer_type == 'auto': - self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_dir, - padding_side='left') - elif tokenizer_type == 'llama': - self.tokenizer = LlamaTokenizer.from_pretrained( - tokenizer_dir, legacy=False, padding_side='left') - else: - raise AttributeError( - f'Unexpected tokenizer type: {tokenizer_type}') - self.tokenizer.pad_token = self.tokenizer.eos_token - - # Parse model output configs - output_config = pb_utils.get_output_config_by_name( - model_config, "OUTPUT") - - # Convert Triton types to numpy types - self.output_dtype = pb_utils.triton_string_to_numpy( - output_config['data_type']) - - def execute(self, requests): - """`execute` must be implemented in every Python model. `execute` - function receives a list of pb_utils.InferenceRequest as the only - argument. This function is called when an inference is requested - for this model. Depending on the batching configuration (e.g. Dynamic - Batching) used, `requests` may contain multiple requests. Every - Python model, must create one pb_utils.InferenceResponse for every - pb_utils.InferenceRequest in `requests`. If there is an error, you can - set the error argument when creating a pb_utils.InferenceResponse. - Parameters - ---------- - requests : list - A list of pb_utils.InferenceRequest - Returns - ------- - list - A list of pb_utils.InferenceResponse. The length of this list must - be the same as `requests` - """ - - responses = [] - - # Every Python backend must iterate over everyone of the requests - # and create a pb_utils.InferenceResponse for each of them. - for idx, request in enumerate(requests): - # Get input tensors - tokens_batch = pb_utils.get_input_tensor_by_name( - request, 'TOKENS_BATCH').as_numpy() - - # Reshape Input - # tokens_batch = tokens_batch.reshape([-1, tokens_batch.shape[0]]) - # tokens_batch = tokens_batch.T - - # Postprocessing output data. - outputs = self._postprocessing(tokens_batch) - - # Create output tensors. You need pb_utils.Tensor - # objects to create pb_utils.InferenceResponse. - output_tensor = pb_utils.Tensor( - 'OUTPUT', - np.array(outputs).astype(self.output_dtype)) - - # Create InferenceResponse. You can set an error here in case - # there was a problem with handling this inference request. - # Below is an example of how you can set errors in inference - # response: - # - # pb_utils.InferenceResponse( - # output_tensors=..., TritonError("An error occurred")) - inference_response = pb_utils.InferenceResponse( - output_tensors=[output_tensor]) - responses.append(inference_response) - - # You should return a list of pb_utils.InferenceResponse. Length - # of this list must match the length of `requests` list. - return responses - - def finalize(self): - """`finalize` is called only once when the model is being unloaded. - Implementing `finalize` function is optional. 
This function allows - the model to perform any necessary clean ups before exit. - """ - print('Cleaning up...') - - def _postprocessing(self, tokens_batch): - outputs = [] - for beam_tokens in tokens_batch: - for tokens in beam_tokens: - output = self.tokenizer.decode(tokens) - outputs.append(output.encode('utf8')) - return outputs diff --git a/all_models/inflight_batcher_llm/postprocessing/config.pbtxt b/all_models/inflight_batcher_llm/postprocessing/config.pbtxt deleted file mode 100755 index 0ac07865..00000000 --- a/all_models/inflight_batcher_llm/postprocessing/config.pbtxt +++ /dev/null @@ -1,64 +0,0 @@ -# Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions -# are met: -# * Redistributions of source code must retain the above copyright -# notice, this list of conditions and the following disclaimer. -# * Redistributions in binary form must reproduce the above copyright -# notice, this list of conditions and the following disclaimer in the -# documentation and/or other materials provided with the distribution. -# * Neither the name of NVIDIA CORPORATION nor the names of its -# contributors may be used to endorse or promote products derived -# from this software without specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY -# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR -# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY -# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -name: "postprocessing" -backend: "python" -max_batch_size: 128 -input [ - { - name: "TOKENS_BATCH" - data_type: TYPE_INT32 - dims: [ -1, -1 ] - } -] -output [ - { - name: "OUTPUT" - data_type: TYPE_STRING - dims: [ -1, -1 ] - } -] - -parameters { - key: "tokenizer_dir" - value: { - string_value: "${tokenizer_dir}" - } -} - -parameters { - key: "tokenizer_type" - value: { - string_value: "${tokenizer_type}" - } -} - -instance_group [ - { - count: 1 - kind: KIND_CPU - } -] diff --git a/all_models/inflight_batcher_llm/preprocessing/1/model.py b/all_models/inflight_batcher_llm/preprocessing/1/model.py deleted file mode 100644 index 1d6b757e..00000000 --- a/all_models/inflight_batcher_llm/preprocessing/1/model.py +++ /dev/null @@ -1,235 +0,0 @@ -# Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions -# are met: -# * Redistributions of source code must retain the above copyright -# notice, this list of conditions and the following disclaimer. -# * Redistributions in binary form must reproduce the above copyright -# notice, this list of conditions and the following disclaimer in the -# documentation and/or other materials provided with the distribution. 
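The `_postprocessing` helper above walks a `[batch, beam, sequence_length]` array of token ids and decodes one string per (batch item, beam) pair. The toy sketch below shows that flattening with a stand-in decode function instead of the real tokenizer.

```python
# Illustrative sketch of the postprocessing flattening described above.
import numpy as np


def postprocess(tokens_batch, decode):
    outputs = []
    for beam_tokens in tokens_batch:   # iterate over batch items
        for tokens in beam_tokens:     # iterate over beams
            outputs.append(decode(tokens).encode("utf8"))
    return outputs


toy_decode = lambda ids: " ".join(f"tok{i}" for i in ids)
tokens = np.array([[[1, 2], [3, 4]]])  # batch=1, beams=2, sequence_length=2
print(postprocess(tokens, toy_decode))
# [b'tok1 tok2', b'tok3 tok4']
```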
-# * Neither the name of NVIDIA CORPORATION nor the names of its -# contributors may be used to endorse or promote products derived -# from this software without specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY -# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR -# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY -# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -import csv -import json -from typing import List - -import numpy as np -import torch -import triton_python_backend_utils as pb_utils -from torch.nn.utils.rnn import pad_sequence -from transformers import AutoTokenizer, LlamaTokenizer, T5Tokenizer - - -class TritonPythonModel: - """Your Python model must use the same class name. Every Python model - that is created must have "TritonPythonModel" as the class name. - """ - - def initialize(self, args): - """`initialize` is called only once when the model is being loaded. - Implementing `initialize` function is optional. This function allows - the model to initialize any state associated with this model. - Parameters - ---------- - args : dict - Both keys and values are strings. The dictionary keys and values are: - * model_config: A JSON string containing the model configuration - * model_instance_kind: A string containing model instance kind - * model_instance_device_id: A string containing model instance device ID - * model_repository: Model repository path - * model_version: Model version - * model_name: Model name - """ - # Parse model configs - model_config = json.loads(args['model_config']) - tokenizer_dir = model_config['parameters']['tokenizer_dir'][ - 'string_value'] - tokenizer_type = model_config['parameters']['tokenizer_type'][ - 'string_value'] - - if tokenizer_type == 't5': - self.tokenizer = T5Tokenizer(vocab_file=tokenizer_dir, - padding_side='left') - elif tokenizer_type == 'auto': - self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_dir, - padding_side='left') - elif tokenizer_type == 'llama': - self.tokenizer = LlamaTokenizer.from_pretrained( - tokenizer_dir, legacy=False, padding_side='left') - else: - raise AttributeError( - f'Unexpected tokenizer type: {tokenizer_type}') - self.tokenizer.pad_token = self.tokenizer.eos_token - - self.pad_id = self.tokenizer.encode(self.tokenizer.pad_token, - add_special_tokens=False)[0] - - # Parse model output configs and convert Triton types to numpy types - input_names = [ - "INPUT_ID", "REQUEST_INPUT_LEN", "BAD_WORDS_IDS", "STOP_WORDS_IDS" - ] - for input_name in input_names: - setattr( - self, - input_name.lower() + "_dtype", - pb_utils.triton_string_to_numpy( - pb_utils.get_output_config_by_name( - model_config, input_name)['data_type'])) - - def execute(self, requests): - """`execute` must be implemented in every Python model. `execute` - function receives a list of pb_utils.InferenceRequest as the only - argument. This function is called when an inference is requested - for this model. 
Depending on the batching configuration (e.g. Dynamic - Batching) used, `requests` may contain multiple requests. Every - Python model, must create one pb_utils.InferenceResponse for every - pb_utils.InferenceRequest in `requests`. If there is an error, you can - set the error argument when creating a pb_utils.InferenceResponse. - Parameters - ---------- - requests : list - A list of pb_utils.InferenceRequest - Returns - ------- - list - A list of pb_utils.InferenceResponse. The length of this list must - be the same as `requests` - """ - - responses = [] - - # Every Python backend must iterate over everyone of the requests - # and create a pb_utils.InferenceResponse for each of them. - for idx, request in enumerate(requests): - # Get input tensors - query = pb_utils.get_input_tensor_by_name(request, - 'QUERY').as_numpy() - request_output_len = pb_utils.get_input_tensor_by_name( - request, 'REQUEST_OUTPUT_LEN').as_numpy() - - bad_words_dict = pb_utils.get_input_tensor_by_name( - request, 'BAD_WORDS_DICT').as_numpy() - stop_words_dict = pb_utils.get_input_tensor_by_name( - request, 'STOP_WORDS_DICT').as_numpy() - - # Preprocessing input data. - input_id, request_input_len = self._create_request(query) - bad_words = self._to_word_list_format(bad_words_dict) - stop_words = self._to_word_list_format(stop_words_dict) - - # Create output tensors. You need pb_utils.Tensor - # objects to create pb_utils.InferenceResponse. - input_id_tensor = pb_utils.Tensor( - 'INPUT_ID', - np.array(input_id).astype(self.input_id_dtype)) - request_input_len_tensor = pb_utils.Tensor( - 'REQUEST_INPUT_LEN', - np.array(request_input_len).astype( - self.request_input_len_dtype)) - request_output_len_tensor = pb_utils.Tensor( - 'REQUEST_OUTPUT_LEN', request_output_len) - bad_words_ids_tensor = pb_utils.Tensor('BAD_WORDS_IDS', bad_words) - stop_words_ids_tensor = pb_utils.Tensor('STOP_WORDS_IDS', - stop_words) - - # Create InferenceResponse. You can set an error here in case - # there was a problem with handling this inference request. - # Below is an example of how you can set errors in inference - # response: - # - # pb_utils.InferenceResponse( - # output_tensors=..., TritonError("An error occurred")) - inference_response = pb_utils.InferenceResponse(output_tensors=[ - input_id_tensor, bad_words_ids_tensor, stop_words_ids_tensor, - request_input_len_tensor, request_output_len_tensor - ]) - responses.append(inference_response) - - # You should return a list of pb_utils.InferenceResponse. Length - # of this list must match the length of `requests` list. - return responses - - def finalize(self): - """`finalize` is called only once when the model is being unloaded. - Implementing `finalize` function is optional. This function allows - the model to perform any necessary clean ups before exit. 
- """ - print('Cleaning up...') - - def _create_request(self, query): - """ - query : batch string (2D numpy array) - """ - start_ids = [ - torch.IntTensor(self.tokenizer.encode(s[0].decode())) - for s in query - ] - start_lengths = torch.IntTensor([[len(ids)] for ids in start_ids]) - - start_ids = pad_sequence(start_ids, - batch_first=True, - padding_value=self.pad_id) - # input_len = min(start_lengths) - #attn_mask = torch.ones((batch_size, input_len, input_len)).tril() - - return start_ids, start_lengths - - def _to_word_list_format(self, word_dict: List[List[str]]): - ''' - format of word_dict - len(word_dict) should be same to batch_size - word_dict[i] means the words for batch i - len(word_dict[i]) must be 1, which means it only contains 1 string - This string can contains several sentences and split by ",". - For example, if word_dict[2] = " I am happy, I am sad", then this function will return - the ids for two short sentences " I am happy" and " I am sad". - ''' - assert self.tokenizer != None, "need to set tokenizer" - - flat_ids = [] - offsets = [] - for word_dict_item in word_dict: - item_flat_ids = [] - item_offsets = [] - - if isinstance(word_dict_item[0], bytes): - word_dict_item = [word_dict_item[0].decode()] - - words = list(csv.reader(word_dict_item))[0] - for word in words: - ids = self.tokenizer.encode(word) - - if len(ids) == 0: - continue - - item_flat_ids += ids - item_offsets.append(len(ids)) - - flat_ids.append(np.array(item_flat_ids)) - offsets.append(np.cumsum(np.array(item_offsets))) - - pad_to = max(1, max(len(ids) for ids in flat_ids)) - - for i, (ids, offs) in enumerate(zip(flat_ids, offsets)): - flat_ids[i] = np.pad(ids, (0, pad_to - len(ids)), - constant_values=0) - offsets[i] = np.pad(offs, (0, pad_to - len(offs)), - constant_values=-1) - - return np.array([flat_ids, offsets], dtype="int32").transpose( - (1, 0, 2)) diff --git a/all_models/inflight_batcher_llm/preprocessing/config.pbtxt b/all_models/inflight_batcher_llm/preprocessing/config.pbtxt deleted file mode 100644 index c113d9cd..00000000 --- a/all_models/inflight_batcher_llm/preprocessing/config.pbtxt +++ /dev/null @@ -1,99 +0,0 @@ -# Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions -# are met: -# * Redistributions of source code must retain the above copyright -# notice, this list of conditions and the following disclaimer. -# * Redistributions in binary form must reproduce the above copyright -# notice, this list of conditions and the following disclaimer in the -# documentation and/or other materials provided with the distribution. -# * Neither the name of NVIDIA CORPORATION nor the names of its -# contributors may be used to endorse or promote products derived -# from this software without specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY -# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -# PURPOSE ARE DISCLAIMED. 
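To make the `_to_word_list_format` layout above concrete: for each batch item the function emits a `[2, max_len]` slice whose first row is the concatenation of all stop/bad-word token ids and whose second row holds their cumulative end offsets, padded with `0` and `-1` respectively. The standalone sketch below reproduces that packing with a toy dict standing in for the Hugging Face tokenizer.

```python
# Illustrative re-implementation of the word-list packing described above,
# using a toy vocabulary instead of a real tokenizer.
import numpy as np


def to_word_list_format(words_per_item, encode):
    flat_ids, offsets = [], []
    for words in words_per_item:
        item_ids, item_offsets = [], []
        for word in words:
            ids = encode(word)
            if not ids:
                continue
            item_ids += ids
            item_offsets.append(len(ids))
        flat_ids.append(np.array(item_ids))
        offsets.append(np.cumsum(np.array(item_offsets)))
    pad_to = max(1, max(len(ids) for ids in flat_ids))
    for i, (ids, offs) in enumerate(zip(flat_ids, offsets)):
        flat_ids[i] = np.pad(ids, (0, pad_to - len(ids)), constant_values=0)
        offsets[i] = np.pad(offs, (0, pad_to - len(offs)), constant_values=-1)
    return np.array([flat_ids, offsets], dtype="int32").transpose((1, 0, 2))


toy_vocab = {"bad": [17], "word": [23, 5]}
encode = lambda w: toy_vocab.get(w.strip(), [])
packed = to_word_list_format([["bad", "word"], ["word"]], encode)
print(packed.shape)  # (2, 2, 3): [batch, ids-vs-offsets, padded length]
print(packed[0])     # [[17 23  5]
                     #  [ 1  3 -1]]
```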
IN NO EVENT SHALL THE COPYRIGHT OWNER OR -# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY -# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -name: "preprocessing" -backend: "python" -max_batch_size: 128 -input [ - { - name: "QUERY" - data_type: TYPE_STRING - dims: [ -1 ] - }, - { - name: "BAD_WORDS_DICT" - data_type: TYPE_STRING - dims: [ -1 ] - }, - { - name: "STOP_WORDS_DICT" - data_type: TYPE_STRING - dims: [ -1 ] - }, - { - name: "REQUEST_OUTPUT_LEN" - data_type: TYPE_UINT32 - dims: [ -1 ] - } -] -output [ - { - name: "INPUT_ID" - data_type: TYPE_INT32 - dims: [ -1 ] - }, - { - name: "REQUEST_INPUT_LEN" - data_type: TYPE_INT32 - dims: [ 1 ] - }, - { - name: "BAD_WORDS_IDS" - data_type: TYPE_INT32 - dims: [ 2, -1 ] - }, - { - name: "STOP_WORDS_IDS" - data_type: TYPE_INT32 - dims: [ 2, -1 ] - }, - { - name: "REQUEST_OUTPUT_LEN" - data_type: TYPE_UINT32 - dims: [ -1 ] - } -] - -parameters { - key: "tokenizer_dir" - value: { - string_value: "${tokenizer_dir}" - } -} - -parameters { - key: "tokenizer_type" - value: { - string_value: "${tokenizer_type}" - } -} - -instance_group [ - { - count: 1 - kind: KIND_CPU - } -] diff --git a/all_models/inflight_batcher_llm/tensorrt_llm/1/.gitkeep b/all_models/inflight_batcher_llm/tensorrt_llm/1/.gitkeep deleted file mode 100644 index e69de29b..00000000 diff --git a/all_models/inflight_batcher_llm/tensorrt_llm/config.pbtxt b/all_models/inflight_batcher_llm/tensorrt_llm/config.pbtxt deleted file mode 100644 index 3875be00..00000000 --- a/all_models/inflight_batcher_llm/tensorrt_llm/config.pbtxt +++ /dev/null @@ -1,208 +0,0 @@ -# Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions -# are met: -# * Redistributions of source code must retain the above copyright -# notice, this list of conditions and the following disclaimer. -# * Redistributions in binary form must reproduce the above copyright -# notice, this list of conditions and the following disclaimer in the -# documentation and/or other materials provided with the distribution. -# * Neither the name of NVIDIA CORPORATION nor the names of its -# contributors may be used to endorse or promote products derived -# from this software without specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY -# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR -# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY -# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
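A note on the `${...}` placeholders that appear throughout the config files in this diff (`${tokenizer_dir}`, `${tokenizer_type}`, `${engine_dir}`, `${decoupled_mode}`, ...): they are template variables that must be substituted with concrete values before the model repository can be served; in the real workflow a small helper script in the repository handles this. The snippet below is only a stand-in to show the idea, with hypothetical paths and values.

```python
# Stand-in for template substitution of ${...} placeholders in config.pbtxt
# files; the path and values below are hypothetical examples.
from pathlib import Path
from string import Template


def fill_template(path, **values):
    text = Path(path).read_text()
    # Template.substitute raises if a placeholder is left unfilled,
    # which helps catch forgotten parameters.
    Path(path).write_text(Template(text).substitute(**values))


fill_template("path/to/preprocessing/config.pbtxt",
              tokenizer_dir="/models/llama/tokenizer",
              tokenizer_type="llama")
```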
- -name: "tensorrt_llm" -backend: "tensorrtllm" -max_batch_size: 128 - -model_transaction_policy { - decoupled: ${decoupled_mode} -} - -input [ - { - name: "input_ids" - data_type: TYPE_INT32 - dims: [ -1 ] - }, - { - name: "input_lengths" - data_type: TYPE_INT32 - dims: [ 1 ] - reshape: { shape: [ ] } - }, - { - name: "request_output_len" - data_type: TYPE_UINT32 - dims: [ 1 ] - }, - { - name: "end_id" - data_type: TYPE_UINT32 - dims: [ 1 ] - reshape: { shape: [ ] } - optional: true - }, - { - name: "pad_id" - data_type: TYPE_UINT32 - dims: [ 1 ] - reshape: { shape: [ ] } - optional: true - }, - { - name: "beam_width" - data_type: TYPE_UINT32 - dims: [ 1 ] - reshape: { shape: [ ] } - optional: true - }, - { - name: "temperature" - data_type: TYPE_FP32 - dims: [ 1 ] - reshape: { shape: [ ] } - optional: true - }, - { - name: "runtime_top_k" - data_type: TYPE_UINT32 - dims: [ 1 ] - reshape: { shape: [ ] } - optional: true - }, - { - name: "runtime_top_p" - data_type: TYPE_FP32 - dims: [ 1 ] - reshape: { shape: [ ] } - optional: true - }, - { - name: "len_penalty" - data_type: TYPE_FP32 - dims: [ 1 ] - reshape: { shape: [ ] } - optional: true - }, - { - name: "repetition_penalty" - data_type: TYPE_FP32 - dims: [ 1 ] - reshape: { shape: [ ] } - optional: true - }, - { - name: "min_length" - data_type: TYPE_UINT32 - dims: [ 1 ] - reshape: { shape: [ ] } - optional: true - }, - { - name: "presence_penalty" - data_type: TYPE_FP32 - dims: [ 1 ] - reshape: { shape: [ ] } - optional: true - }, - { - name: "random_seed" - data_type: TYPE_UINT64 - dims: [ 1 ] - reshape: { shape: [ ] } - optional: true - }, - { - name: "stop" - data_type: TYPE_BOOL - dims: [ 1 ] - optional: true - }, - { - name: "streaming" - data_type: TYPE_BOOL - dims: [ 1 ] - optional: true - } -] -output [ - { - name: "output_ids" - data_type: TYPE_INT32 - dims: [ -1, -1 ] - } -] -instance_group [ - { - count: 1 - kind : KIND_CPU - } -] -parameters: { - key: "max_beam_width" - value: { - string_value: "1" - } -} -parameters: { - key: "FORCE_CPU_ONLY_INPUT_TENSORS" - value: { - string_value: "no" - } -} -parameters: { - key: "gpt_model_type" - value: { - string_value: "inflight_fused_batching" - } -} -parameters: { - key: "gpt_model_path" - value: { - string_value: "${engine_dir}" - } -} -parameters: { - key: "max_tokens_in_paged_kv_cache" - value: { - string_value: "${max_tokens_in_paged_kv_cache}" - } -} -parameters: { - key: "batch_scheduler_policy" - value: { - string_value: "${batch_scheduler_policy}" - } -} -parameters: { - key: "kv_cache_free_gpu_mem_fraction" - value: { - string_value: "${kv_cache_free_gpu_mem_fraction}" - } -} -parameters: { - key: "max_num_sequences" - value: { - string_value: "${max_num_sequences}" - } -} -parameters: { - key: "enable_trt_overlap" - value: { - string_value: "${enable_trt_overlap}" - } -} diff --git a/build.sh b/build.sh new file mode 100755 index 00000000..e805d566 --- /dev/null +++ b/build.sh @@ -0,0 +1,65 @@ +#!/bin/bash + +git lfs install +git submodule update --init --recursive + +# Default values will be used if not set +BASE_IMAGE=${BASE_IMAGE:-nvcr.io/nvidia/tritonserver:24.11-py3-min} +PYTORCH_IMAGE=${PYTORCH_IMAGE:-nvcr.io/nvidia/pytorch:24.11-py3} +TRT_VERSION=${TRT_VERSION:-10.7.0.23} +TRT_URL_x86=${TRT_URL_x86:-https://developer.nvidia.com/downloads/compute/machine-learning/tensorrt/10.7.0/tars/TensorRT-${TRT_VERSION}.Linux.x86_64-gnu.cuda-12.6.tar.gz} 
+TRT_URL_ARM=${TRT_URL_ARM:-https://developer.nvidia.com/downloads/compute/machine-learning/tensorrt/10.7.0/tars/TensorRT-${TRT_VERSION}.ubuntu-24.04.aarch64-gnu.cuda-12.6.tar.gz} + +# Build the TRT-LLM base image that has TRT-LLM installed and will be used as +# the base image for building Triton server and TRT-LLM backend. +docker build -t trtllm_base \ + --build-arg BASE_IMAGE="${BASE_IMAGE}" \ + --build-arg PYTORCH_IMAGE="${PYTORCH_IMAGE}" \ + --build-arg TRT_VER="${TRT_VERSION}" \ + --build-arg RELEASE_URL_TRT_x86="${TRT_URL_x86}" \ + --build-arg RELEASE_URL_TRT_ARM="${TRT_URL_ARM}" \ + -f dockerfile/Dockerfile.triton.trt_llm_backend . + +# Clone the Triton server repository on the same level as the TRT-LLM backend repository. +cd ../ +# Need to use the aligned version of the Triton server repository. +# Refer to the support matrix for the aligned version: https://docs.nvidia.com/deeplearning/frameworks/support-matrix/index.html +TRITON_SERVER_REPO_TAG=${TRITON_SERVER_REPO_TAG:-r24.11} +git clone -b ${TRITON_SERVER_REPO_TAG} https://github.com/triton-inference-server/server.git +cd server + +# The `TRTLLM_BASE_IMAGE` is the base image that will be used to build the +# container. The `TENSORRTLLM_BACKEND_REPO_TAG` and `PYTHON_BACKEND_REPO_TAG` are +# the tags of the TensorRT-LLM backend and Python backend repositories that will +# be used to build the container. +TRTLLM_BASE_IMAGE=${TRTLLM_BASE_IMAGE:-trtllm_base} +TENSORRTLLM_BACKEND_REPO_TAG=${TENSORRTLLM_BACKEND_REPO_TAG:-v0.15.0} +PYTHON_BACKEND_REPO_TAG=${PYTHON_BACKEND_REPO_TAG:-r24.11} + +TRITON_GITHUB_ORGANIZATION=${TRITON_GITHUB_ORGANIZATION:-} +if [ "$TRITON_GITHUB_ORGANIZATION" != "" ] +then + GITHUB_ORGANIZATION="--github-organization=${TRITON_GITHUB_ORGANIZATION}" +else + GITHUB_ORGANIZATION="" +fi + +TRITON_CONTAINER_PREBUILD_COMMAND=${TRITON_CONTAINER_PREBUILD_COMMAND:-} +if [ "$TRITON_CONTAINER_PREBUILD_COMMAND" != "" ] +then + CONTAINER_PREBUILD_COMMAND="--container-prebuild-command=${TRITON_CONTAINER_PREBUILD_COMMAND}" +else + CONTAINER_PREBUILD_COMMAND="" +fi + +# The flags for some features or endpoints can be removed if not needed. 
+./build.py -v --no-container-interactive --enable-logging --enable-stats --enable-tracing \ + --enable-metrics --enable-gpu-metrics --enable-cpu-metrics \ + --filesystem=gcs --filesystem=s3 --filesystem=azure_storage \ + --endpoint=http --endpoint=grpc --endpoint=sagemaker --endpoint=vertex-ai \ + --backend=ensemble --enable-gpu --no-container-pull \ + --repoagent=checksum --cache=local --cache=redis \ + --image=base,${TRTLLM_BASE_IMAGE} \ + --backend=tensorrtllm:${TENSORRTLLM_BACKEND_REPO_TAG} \ + --backend=python:${PYTHON_BACKEND_REPO_TAG} \ + "${GITHUB_ORGANIZATION}" "${CONTAINER_PREBUILD_COMMAND}" diff --git a/dockerfile/Dockerfile.triton.trt_llm_backend b/dockerfile/Dockerfile.triton.trt_llm_backend new file mode 100644 index 00000000..8287a0b2 --- /dev/null +++ b/dockerfile/Dockerfile.triton.trt_llm_backend @@ -0,0 +1,205 @@ +ARG BASE_IMAGE=nvcr.io/nvidia/tritonserver:25.03-py3-min +ARG PYTORCH_IMAGE=nvcr.io/nvidia/pytorch:25.03-py3 +ARG NVRTC_VER=12.8.61-1 +ARG TRT_VER=10.9.0.34 +ARG RELEASE_URL_TRT_x86=https://developer.nvidia.com/downloads/compute/machine-learning/tensorrt/10.9.0/tars/TensorRT-${TRT_VER}.Linux.x86_64-gnu.cuda-12.8.tar.gz +ARG RELEASE_URL_TRT_ARM=https://developer.nvidia.com/downloads/compute/machine-learning/tensorrt/10.9.0/tars/TensorRT-${TRT_VER}.Linux.aarch64-gnu.cuda-12.8.tar.gz + +FROM ${PYTORCH_IMAGE} as pytorch_image +FROM ${BASE_IMAGE} as install_dependencies + +ARG CCACHE_REMOTE_STORAGE +ARG CCACHE_URL +ENV CCACHE_DEBUG=1 + +RUN if [ -n "${CCACHE_REMOTE_STORAGE}" ] ; then \ + curl -k -L ${CCACHE_URL} -o ccache.tar.gz ; \ + tar -xzf ccache.tar.gz -C /usr/local --strip-components=1 ; \ + rm ccache.tar.gz ; \ + ccache --set-config=remote_only=true ; \ + ccache --set-config=remote_storage=${CCACHE_REMOTE_STORAGE} ; \ + ccache --set-config=log_file=/tmp/ccache.log ; \ + ccache -p ; \ + fi + +# Copy PyTorch package from PyTorch image +COPY --from=pytorch_image /usr/local/lib/lib* /usr/local/lib/ +COPY --from=pytorch_image /usr/local/lib/python3.12/dist-packages/torch /usr/local/lib/python3.12/dist-packages/torch +COPY --from=pytorch_image /usr/local/lib/python3.12/dist-packages/torch-2.7.0a0+7c8ec84dab.nv25.3.dist-info /usr/local/lib/python3.12/dist-packages/torch-2.7.0a0+7c8ec84dab.nv25.3.dist-info +COPY --from=pytorch_image /usr/local/lib/python3.12/dist-packages/torchgen /usr/local/lib/python3.12/dist-packages/torchgen +COPY --from=pytorch_image /usr/local/lib/python3.12/dist-packages/torchvision /usr/local/lib/python3.12/dist-packages/torchvision +COPY --from=pytorch_image /usr/local/lib/python3.12/dist-packages/torchvision-0.22.0a0.dist-info /usr/local/lib/python3.12/dist-packages/torchvision-0.22.0a0.dist-info +COPY --from=pytorch_image /usr/local/lib/python3.12/dist-packages/torchvision.libs /usr/local/lib/python3.12/dist-packages/torchvision.libs +COPY --from=pytorch_image /usr/local/lib/python3.12/dist-packages/setuptools /usr/local/lib/python3.12/dist-packages/setuptools +COPY --from=pytorch_image /usr/local/lib/python3.12/dist-packages/setuptools-75.8.2.dist-info /usr/local/lib/python3.12/dist-packages/setuptools-75.8.2.dist-info +COPY --from=pytorch_image /usr/local/lib/python3.12/dist-packages/functorch /usr/local/lib/python3.12/dist-packages/functorch +COPY --from=pytorch_image /usr/local/lib/python3.12/dist-packages/pytorch_triton-3.2.0+gitb2684bf3b.nvinternal.dist-info /usr/local/lib/python3.12/dist-packages/pytorch_triton-3.2.0+gitb2684bf3b.nvinternal.dist-info +COPY --from=pytorch_image /usr/local/lib/python3.12/dist-packages/triton 
/usr/local/lib/python3.12/dist-packages/triton +COPY --from=pytorch_image /usr/local/lib/python3.12/dist-packages/jinja2 /usr/local/lib/python3.12/dist-packages/jinja2 +COPY --from=pytorch_image /usr/local/lib/python3.12/dist-packages/jinja2-3.1.6.dist-info /usr/local/lib/python3.12/dist-packages/jinja2-3.1.6.dist-info +COPY --from=pytorch_image /usr/local/lib/python3.12/dist-packages/networkx /usr/local/lib/python3.12/dist-packages/networkx +COPY --from=pytorch_image /usr/local/lib/python3.12/dist-packages/networkx-3.4.2.dist-info /usr/local/lib/python3.12/dist-packages/networkx-3.4.2.dist-info +COPY --from=pytorch_image /usr/local/lib/python3.12/dist-packages/sympy /usr/local/lib/python3.12/dist-packages/sympy +COPY --from=pytorch_image /usr/local/lib/python3.12/dist-packages/sympy-1.13.1.dist-info /usr/local/lib/python3.12/dist-packages/sympy-1.13.1.dist-info +COPY --from=pytorch_image /usr/local/lib/python3.12/dist-packages/packaging /usr/local/lib/python3.12/dist-packages/packaging +COPY --from=pytorch_image /usr/local/lib/python3.12/dist-packages/packaging-23.2.dist-info /usr/local/lib/python3.12/dist-packages/packaging-23.2.dist-info +COPY --from=pytorch_image /usr/local/lib/python3.12/dist-packages/flash_attn /usr/local/lib/python3.12/dist-packages/flash_attn +COPY --from=pytorch_image /usr/local/lib/python3.12/dist-packages/flash_attn-2.7.3.dist-info /usr/local/lib/python3.12/dist-packages/flash_attn-2.7.3.dist-info +COPY --from=pytorch_image /usr/local/lib/python3.12/dist-packages/flash_attn_2_cuda.cpython-312-*-linux-gnu.so /usr/local/lib/python3.12/dist-packages/ + +# Might not need to copy cusparseLt in the future once it's included in DLFW cuda container +COPY --from=pytorch_image /usr/local/cuda/lib64/libcusparseLt* /usr/local/cuda/lib64/ + +ENV PIP_BREAK_SYSTEM_PACKAGES=1 +RUN apt-get update -q=2 && \ + apt-get install -y --no-install-recommends \ + python3-dev \ + python3-pip \ + git-lfs && \ + # Remove previous TRT installation + apt-get remove -y tensorrt* libnvinfer* && \ + pip3 uninstall -y tensorrt && \ + rm -rf /var/lib/apt/lists/* + +ARG TRT_VER +ARG NVRTC_VER + +ENV TRT_VERSION=$TRT_VER \ + TRT_VER=$TRT_VER \ + CUDA_VER=$CUDA_VERSION \ + CUDNN_VER=$CUDNN_VERSION \ + NCCL_VER=$NCCL_VERSION \ + CUBLAS_VER=$CUBLAS_VERSION \ + NVRTC_VER="${NVRTC_VER}" + +LABEL TRT_VERSION $TRT_VER + +# Install NVRTC +RUN [ "$(uname -m)" != "x86_64" ] && arch="sbsa" || arch="x86_64" && \ + curl -o /tmp/cuda-keyring.deb https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/$arch/cuda-keyring_1.0-1_all.deb && \ + apt install /tmp/cuda-keyring.deb && \ + rm /tmp/cuda-keyring.deb && \ + apt-get remove --purge -y --allow-change-held-packages cuda-nvrtc-dev* && \ + CUDA_VER_SHORT=${CUDA_VER: 0:4} && \ + NVRTC_CUDA_VERSION=${CUDA_VER_SHORT/./-} && \ + apt-get update -qq && \ + apt-get install -y --no-install-recommends cuda-nvrtc-dev-${NVRTC_CUDA_VERSION}=${NVRTC_VER} && \ + rm -rf /var/lib/apt/lists/* + +# Download & install TRT release +ARG RELEASE_URL_TRT_x86 +ARG RELEASE_URL_TRT_ARM + +RUN [ "$(uname -m)" != "x86_64" ] && RELEASE_URL_TRT=${RELEASE_URL_TRT_ARM} || RELEASE_URL_TRT=${RELEASE_URL_TRT_x86} \ + && curl -fSL -o /tmp/tensorrt.tar.gz ${RELEASE_URL_TRT} \ + # Extract the tarball, excluding Windows libraries and static libraries as + # they are not needed for Linux build + && tar xzvf /tmp/tensorrt.tar.gz --exclude="lib*win.so*" --exclude="*.a" -C /usr/local \ + && rm /tmp/tensorrt.tar.gz \ + && find /usr/local -maxdepth 1 -name Tens* -type d -exec ln -s {} 
/usr/local/tensorrt \; + +RUN pip3 install /usr/local/tensorrt/python/tensorrt-*-cp$( python3 -c "import sys; print(str(sys.version_info.major) + str(sys.version_info.minor))" )* + +ENV LD_LIBRARY_PATH=/usr/local/tensorrt/lib:${LD_LIBRARY_PATH} +ENV TRT_ROOT=/usr/local/tensorrt + +FROM install_dependencies as tensorrt_llm_build + +WORKDIR /workspace + +RUN pip3 install --no-cache-dir polygraphy==0.49.9 mpi4py==3.1.5 cmake==3.30.2 + +COPY scripts scripts +COPY tensorrt_llm tensorrt_llm +RUN cd tensorrt_llm && \ + if [ -n "${CCACHE_REMOTE_STORAGE}" ] ; then \ + python3 scripts/build_wheel.py --trt_root="${TRT_ROOT}" --clean --use_ccache ; \ + else \ + python3 scripts/build_wheel.py --trt_root="${TRT_ROOT}" --clean ; \ + fi + +# Final stage to build the TRT-LLM container +FROM ${BASE_IMAGE} as final_stage + +# Copy necessary files from the base stage +COPY --from=pytorch_image /usr/local/lib/lib* /usr/local/lib/ +COPY --from=pytorch_image /usr/local/lib/python3.12/dist-packages/torch /usr/local/lib/python3.12/dist-packages/torch +COPY --from=pytorch_image /usr/local/lib/python3.12/dist-packages/torch-2.7.0a0+7c8ec84dab.nv25.3.dist-info /usr/local/lib/python3.12/dist-packages/torch-2.7.0a0+7c8ec84dab.nv25.3.dist-info +COPY --from=pytorch_image /usr/local/lib/python3.12/dist-packages/torchgen /usr/local/lib/python3.12/dist-packages/torchgen +COPY --from=pytorch_image /usr/local/lib/python3.12/dist-packages/torchvision /usr/local/lib/python3.12/dist-packages/torchvision +COPY --from=pytorch_image /usr/local/lib/python3.12/dist-packages/torchvision-0.22.0a0.dist-info /usr/local/lib/python3.12/dist-packages/torchvision-0.22.0a0.dist-info +COPY --from=pytorch_image /usr/local/lib/python3.12/dist-packages/torchvision.libs /usr/local/lib/python3.12/dist-packages/torchvision.libs +COPY --from=pytorch_image /usr/local/lib/python3.12/dist-packages/setuptools /usr/local/lib/python3.12/dist-packages/setuptools +COPY --from=pytorch_image /usr/local/lib/python3.12/dist-packages/setuptools-75.8.2.dist-info /usr/local/lib/python3.12/dist-packages/setuptools-75.8.2.dist-info +COPY --from=pytorch_image /usr/local/lib/python3.12/dist-packages/functorch /usr/local/lib/python3.12/dist-packages/functorch +COPY --from=pytorch_image /usr/local/lib/python3.12/dist-packages/pytorch_triton-3.2.0+gitb2684bf3b.nvinternal.dist-info /usr/local/lib/python3.12/dist-packages/pytorch_triton-3.2.0+gitb2684bf3b.nvinternal.dist-info +COPY --from=pytorch_image /usr/local/lib/python3.12/dist-packages/triton /usr/local/lib/python3.12/dist-packages/triton +COPY --from=pytorch_image /usr/local/lib/python3.12/dist-packages/jinja2 /usr/local/lib/python3.12/dist-packages/jinja2 +COPY --from=pytorch_image /usr/local/lib/python3.12/dist-packages/jinja2-3.1.6.dist-info /usr/local/lib/python3.12/dist-packages/jinja2-3.1.6.dist-info +COPY --from=pytorch_image /usr/local/lib/python3.12/dist-packages/networkx /usr/local/lib/python3.12/dist-packages/networkx +COPY --from=pytorch_image /usr/local/lib/python3.12/dist-packages/networkx-3.4.2.dist-info /usr/local/lib/python3.12/dist-packages/networkx-3.4.2.dist-info +COPY --from=pytorch_image /usr/local/lib/python3.12/dist-packages/sympy /usr/local/lib/python3.12/dist-packages/sympy +COPY --from=pytorch_image /usr/local/lib/python3.12/dist-packages/sympy-1.13.1.dist-info /usr/local/lib/python3.12/dist-packages/sympy-1.13.1.dist-info +COPY --from=pytorch_image /usr/local/lib/python3.12/dist-packages/packaging /usr/local/lib/python3.12/dist-packages/packaging +COPY --from=pytorch_image 
/usr/local/lib/python3.12/dist-packages/packaging-23.2.dist-info /usr/local/lib/python3.12/dist-packages/packaging-23.2.dist-info +COPY --from=pytorch_image /usr/local/lib/python3.12/dist-packages/flash_attn /usr/local/lib/python3.12/dist-packages/flash_attn +COPY --from=pytorch_image /usr/local/lib/python3.12/dist-packages/flash_attn-2.7.3.dist-info /usr/local/lib/python3.12/dist-packages/flash_attn-2.7.3.dist-info +COPY --from=pytorch_image /usr/local/lib/python3.12/dist-packages/flash_attn_2_cuda.cpython-312-*-linux-gnu.so /usr/local/lib/python3.12/dist-packages/ + +# Might not need to copy cusparseLt in the future once it's included in DLFW cuda container +COPY --from=pytorch_image /usr/local/cuda/lib64/libcusparseLt* /usr/local/cuda/lib64/ + +ARG NVRTC_VER +ENV CUDA_VER=$CUDA_VERSION \ + NVRTC_VER="${NVRTC_VER}" + +# Install the necessary dependencies and remove previous TRT installation in the +# final image +ENV PIP_BREAK_SYSTEM_PACKAGES=1 +RUN apt-get update -q=2 && \ + apt-get install -y --no-install-recommends \ + python3-dev \ + python3-pip \ + python-is-python3 \ + git-lfs && \ + apt-get remove -y tensorrt* libnvinfer* && \ + rm -rf /var/lib/apt/lists/* && \ + pip3 uninstall -y tensorrt && \ + pip3 install --no-cache-dir polygraphy==0.49.9 mpi4py==3.1.5 + +# Install NVRTC +RUN [ "$(uname -m)" != "x86_64" ] && arch="sbsa" || arch="x86_64" && \ + curl -o /tmp/cuda-keyring.deb https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/$arch/cuda-keyring_1.0-1_all.deb && \ + apt install /tmp/cuda-keyring.deb && \ + rm /tmp/cuda-keyring.deb && \ + apt-get remove --purge -y --allow-change-held-packages cuda-nvrtc-dev* && \ + CUDA_VER_SHORT=${CUDA_VER: 0:4} && \ + NVRTC_CUDA_VERSION=${CUDA_VER_SHORT/./-} && \ + apt-get update -qq && \ + apt-get install -y --no-install-recommends cuda-nvrtc-dev-${NVRTC_CUDA_VERSION}=${NVRTC_VER} && \ + rm -rf /var/lib/apt/lists/* + +# Install TRT +COPY --from=install_dependencies /usr/local/tensorrt /usr/local/tensorrt +RUN pip3 install /usr/local/tensorrt/python/tensorrt-*-cp$( python3 -c "import sys; print(str(sys.version_info.major) + str(sys.version_info.minor))" )* + +# Set environment variables +ARG TRT_VER +ENV TRT_VERSION=$TRT_VER +ENV LD_LIBRARY_PATH=/usr/local/tensorrt/lib:${LD_LIBRARY_PATH} +ENV TRT_ROOT=/usr/local/tensorrt + +WORKDIR /tmp + +# Install TRT-LLM wheel after all the dependencies are installed +COPY --from=tensorrt_llm_build /workspace/tensorrt_llm/build/tensorrt_llm*whl . +RUN pip3 install --no-cache-dir tensorrt_llm*.whl && \ + rm -f tensorrt_llm*.whl + +# Copying the scripts +WORKDIR /app +COPY scripts scripts +COPY all_models all_models +COPY inflight_batcher_llm/client client +COPY tools tools +COPY tensorrt_llm/examples examples diff --git a/dockerfile/Dockerfile.trt_llm_backend b/dockerfile/Dockerfile.trt_llm_backend deleted file mode 100644 index efb55586..00000000 --- a/dockerfile/Dockerfile.trt_llm_backend +++ /dev/null @@ -1,53 +0,0 @@ -ARG BASE_IMAGE=nvcr.io/nvidia/tritonserver -ARG BASE_TAG=23.08-py3 - -FROM ${BASE_IMAGE}:${BASE_TAG} as base - -RUN apt-get update && apt-get install -y --no-install-recommends rapidjson-dev python-is-python3 - -COPY requirements.txt /tmp/ -RUN pip3 install -r /tmp/requirements.txt --extra-index-url https://pypi.ngc.nvidia.com - -# Remove prevous TRT installation -# We didn't remove libnvinfer* here because tritonserver depends on the pre-installed libraries. 
-RUN apt-get remove --purge -y tensorrt* -RUN pip uninstall -y tensorrt - -FROM base as dev - -# Download & install internal TRT release -COPY tensorrt_llm/docker/common/install_tensorrt.sh /tmp/ -RUN bash /tmp/install_tensorrt.sh && rm /tmp/install_tensorrt.sh -ENV LD_LIBRARY_PATH=/usr/local/tensorrt/lib:${LD_LIBRARY_PATH} -ENV TRT_ROOT=/usr/local/tensorrt - -# Install latest Polygraphy -COPY tensorrt_llm/docker/common/install_polygraphy.sh /tmp/ -RUN bash /tmp/install_polygraphy.sh && rm /tmp/install_polygraphy.sh - -# CMake -COPY tensorrt_llm/docker/common/install_cmake.sh /tmp/ -RUN bash /tmp/install_cmake.sh && rm /tmp/install_cmake.sh -ENV PATH="/usr/local/cmake/bin:${PATH}" - -COPY tensorrt_llm/requirements-dev.txt /tmp/ -RUN pip install -r /tmp/requirements-dev.txt --extra-index-url https://pypi.ngc.nvidia.com - -FROM dev as trt_llm_builder - -WORKDIR /app -COPY scripts scripts -COPY tensorrt_llm tensorrt_llm -RUN cd tensorrt_llm && python3 scripts/build_wheel.py --trt_root="${TRT_ROOT}" -i -c && cd .. - -FROM trt_llm_builder as trt_llm_backend_builder - -WORKDIR /app/ -COPY inflight_batcher_llm inflight_batcher_llm -RUN cd inflight_batcher_llm && bash scripts/build.sh && cd .. - -FROM trt_llm_backend_builder as final - -# Install tensorrtllm backend -RUN mkdir /opt/tritonserver/backends/tensorrtllm -COPY --from=trt_llm_backend_builder /app/inflight_batcher_llm/build/libtriton_tensorrtllm.so /opt/tritonserver/backends/tensorrtllm diff --git a/docs/baichuan.md b/docs/baichuan.md new file mode 100644 index 00000000..3d383330 --- /dev/null +++ b/docs/baichuan.md @@ -0,0 +1,412 @@ + +## End to end workflow to run baichuan + +* Build engine + +```bash +export HF_BAICHUAN_MODEL=Baichuan-13B-Chat/ +python build.py --model_dir ${HF_BAICHUAN_MODEL} \ + --dtype float16 \ + --remove_input_padding \ + --use_gpt_attention_plugin float16 \ + --enable_context_fmha \ + --use_gemm_plugin float16 \ + --output_dir /tmp/baichuan/13B/trt_engines/fp16/1-gpu/ \ + --kv_cache_type paged \ + --max_batch_size 64 + +[11/29/2023-08:20:34] [TRT] [I] Total Host Persistent Memory: 77008 +[11/29/2023-08:20:34] [TRT] [I] Total Device Persistent Memory: 0 +[11/29/2023-08:20:34] [TRT] [I] Total Scratch Memory: 1342439424 +[11/29/2023-08:20:34] [TRT] [I] [BlockAssignment] Started assigning block shifts. This will take 690 steps to complete. +[11/29/2023-08:20:34] [TRT] [I] [BlockAssignment] Algorithm ShiftNTopDown took 25.5938ms to assign 11 blocks to 690 nodes requiring 6308236288 bytes. +[11/29/2023-08:20:34] [TRT] [I] Total Activation Memory: 6308236288 +[11/29/2023-08:20:35] [TRT] [I] Total Weights Memory: 26529804072 +[11/29/2023-08:20:35] [TRT] [I] [MemUsageChange] Init cuBLAS/cuBLASLt: CPU +0, GPU +64, now: CPU 56027, GPU 28529 (MiB) +[11/29/2023-08:20:35] [TRT] [I] [MemUsageChange] Init cuDNN: CPU +0, GPU +72, now: CPU 56027, GPU 28601 (MiB) +[11/29/2023-08:20:35] [TRT] [I] [MemUsageStats] Peak memory usage of TRT CPU/GPU memory allocators: CPU 1250 MiB, GPU 41088 MiB +[11/29/2023-08:20:35] [TRT] [I] [MemUsageChange] TensorRT-managed allocation in building engine: CPU +0, GPU +25301, now: CPU 0, GPU 25301 (MiB) +[11/29/2023-08:20:44] [TRT] [I] [MemUsageStats] Peak memory usage during Engine building and serialization: CPU: 81260 MiB +[11/29/2023-08:20:44] [TRT-LLM] [I] Total time of building baichuan_float16_tp1_rank0.engine: 00:00:37 +[11/29/2023-08:20:44] [TRT-LLM] [I] Config saved to /tmp/baichuan/13B/trt_engines/fp16/1-gpu/config.json. 
+[11/29/2023-08:20:45] [TRT-LLM] [I] Serializing engine to /tmp/baichuan/13B/trt_engines/fp16/1-gpu/baichuan_float16_tp1_rank0.engine...
+[11/29/2023-08:21:35] [TRT-LLM] [I] Engine serialized. Total time: 00:00:49
+[11/29/2023-08:21:36] [TRT-LLM] [I] Timing cache serialized to /tmp/baichuan/13B/trt_engines/fp16/1-gpu/model.cache
+[11/29/2023-08:21:36] [TRT-LLM] [I] Total time of building all 1 engines: 00:05:00
+```
+
+* Prepare configs
+
+```bash
+cp tensorrt_llm/triton_backend/all_models/inflight_batcher_llm/ baichuan_ifb -r
+
+python3 tensorrt_llm/triton_backend/tools/fill_template.py -i baichuan_ifb/preprocessing/config.pbtxt tokenizer_dir:${HF_BAICHUAN_MODEL},triton_max_batch_size:64,preprocessing_instance_count:1
+python3 tensorrt_llm/triton_backend/tools/fill_template.py -i baichuan_ifb/postprocessing/config.pbtxt tokenizer_dir:${HF_BAICHUAN_MODEL},triton_max_batch_size:64,postprocessing_instance_count:1
+python3 tensorrt_llm/triton_backend/tools/fill_template.py -i baichuan_ifb/tensorrt_llm_bls/config.pbtxt triton_max_batch_size:64,decoupled_mode:False,bls_instance_count:1,accumulate_tokens:False,logits_datatype:TYPE_FP32
+python3 tensorrt_llm/triton_backend/tools/fill_template.py -i baichuan_ifb/ensemble/config.pbtxt triton_max_batch_size:64,logits_datatype:TYPE_FP32
+python3 tensorrt_llm/triton_backend/tools/fill_template.py -i baichuan_ifb/tensorrt_llm/config.pbtxt triton_backend:tensorrtllm,triton_max_batch_size:64,decoupled_mode:False,max_beam_width:1,engine_dir:/tmp/baichuan/13B/trt_engines/fp16/1-gpu/,max_tokens_in_paged_kv_cache:2560,max_attention_window_size:2560,kv_cache_free_gpu_mem_fraction:0.5,exclude_input_in_output:True,enable_kv_cache_reuse:False,batching_strategy:inflight_fused_batching,max_queue_delay_microseconds:0,encoder_input_features_data_type:TYPE_FP16,logits_datatype:TYPE_FP32
+```
+
+* Launch server
+
+```bash
+pip install SentencePiece
+python3 tensorrt_llm/triton_backend/scripts/launch_triton_server.py --world_size 1 --model_repo=baichuan_ifb/
+```
+
+This setting requires about 35 GB of GPU memory:
+
+```bash
+nvidia-smi
+
+Wed Nov 29 08:33:50 2023
++---------------------------------------------------------------------------------------+
+| NVIDIA-SMI 535.129.03 Driver Version: 535.129.03 CUDA Version: 12.2 |
+|-----------------------------------------+----------------------+----------------------+
+| GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC |
+| Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. |
+| | | MIG M. |
+|=========================================+======================+======================|
+| 0 NVIDIA H100 PCIe On | 00000000:41:00.0 Off | 0 |
+| N/A 43C P0 81W / 350W | 34743MiB / 81559MiB | 0% Default |
+| | | Disabled |
++-----------------------------------------+----------------------+----------------------+
+
++---------------------------------------------------------------------------------------+
+| Processes: |
+| GPU GI CI PID Type Process name GPU Memory |
+| ID ID Usage |
+|=======================================================================================|
++---------------------------------------------------------------------------------------+
+```
+
+If you encounter the following error:
+
+```bash
+I1129 08:28:33.267969 15088 model_lifecycle.cc:818] successfully loaded 'tensorrt_llm_bls'
+I1129 08:28:33.928915 15088 pb_stub.cc:325] Failed to initialize Python stub: ValueError: Tokenizer class BaichuanTokenizer does not exist or is not currently imported.
+ +At: + /home/bhsueh/.local/lib/python3.10/site-packages/transformers/models/auto/tokenization_auto.py(748): from_pretrained + /home/scratch.bhsueh_sw_1/workspace/TensorRT-LLM/tllm_backend_nvbug/baichuan_ifb/preprocessing/1/model.py(66): initialize + +I1129 08:28:33.928991 15088 pb_stub.cc:325] Failed to initialize Python stub: ValueError: Tokenizer class BaichuanTokenizer does not exist or is not currently imported. + +At: + /home/bhsueh/.local/lib/python3.10/site-packages/transformers/models/auto/tokenization_auto.py(748): from_pretrained + /home/scratch.bhsueh_sw_1/workspace/TensorRT-LLM/tllm_backend_nvbug/baichuan_ifb/postprocessing/1/model.py(65): initialize + +E1129 08:28:34.285773 15088 backend_model.cc:634] ERROR: Failed to create instance: ValueError: Tokenizer class BaichuanTokenizer does not exist or is not currently imported. + +At: + /home/bhsueh/.local/lib/python3.10/site-packages/transformers/models/auto/tokenization_auto.py(748): from_pretrained + /home/scratch.bhsueh_sw_1/workspace/TensorRT-LLM/tllm_backend_nvbug/baichuan_ifb/postprocessing/1/model.py(65): initialize + +E1129 08:28:34.285879 15088 model_lifecycle.cc:621] failed to load 'postprocessing' version 1: Internal: ValueError: Tokenizer class BaichuanTokenizer does not exist or is not currently imported. + +At: + /home/bhsueh/.local/lib/python3.10/site-packages/transformers/models/auto/tokenization_auto.py(748): from_pretrained + /home/scratch.bhsueh_sw_1/workspace/TensorRT-LLM/tllm_backend_nvbug/baichuan_ifb/postprocessing/1/model.py(65): initialize + +I1129 08:28:34.285894 15088 model_lifecycle.cc:756] failed to load 'postprocessing' +E1129 08:28:34.304925 15088 backend_model.cc:634] ERROR: Failed to create instance: ValueError: Tokenizer class BaichuanTokenizer does not exist or is not currently imported. + +At: + /home/bhsueh/.local/lib/python3.10/site-packages/transformers/models/auto/tokenization_auto.py(748): from_pretrained + /home/scratch.bhsueh_sw_1/workspace/TensorRT-LLM/tllm_backend_nvbug/baichuan_ifb/preprocessing/1/model.py(66): initialize + +E1129 08:28:34.305028 15088 model_lifecycle.cc:621] failed to load 'preprocessing' version 1: Internal: ValueError: Tokenizer class BaichuanTokenizer does not exist or is not currently imported. + +At: + /home/bhsueh/.local/lib/python3.10/site-packages/transformers/models/auto/tokenization_auto.py(748): from_pretrained + /home/scratch.bhsueh_sw_1/workspace/TensorRT-LLM/tllm_backend_nvbug/baichuan_ifb/preprocessing/1/model.py(66): initialize + +I1129 08:28:34.305052 15088 model_lifecycle.cc:756] failed to load 'preprocessing' +``` + +please add `trust_remote_code=True` in tokenizer of preprocessing and postprocessing. Considering the security, we don't add it by default. 
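+
+As a quick way to locate the calls that need this change (a sketch, assuming the
+preprocessing and postprocessing models load the tokenizer through a Hugging Face
+`from_pretrained` call, as the traceback above suggests):
+
+```bash
+# Find the tokenizer initialization in the two Python models, then append
+# trust_remote_code=True to the from_pretrained(...) call in each file.
+grep -n "from_pretrained" baichuan_ifb/preprocessing/1/model.py baichuan_ifb/postprocessing/1/model.py
+```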
+ +* Send request + +```bash +curl -X POST localhost:8000/v2/models/ensemble/generate -d '{"text_input": "What is machine learning?", "max_tokens": 20, "bad_words": "", "stop_words": "", "pad_id": 2, "end_id": 2}' + +{"cum_log_probs":0.0,"model_name":"ensemble","model_version":"1","output_log_probs":[0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0],"sequence_end":false,"sequence_id":0,"sequence_start":false,"text_output":"\nMachine learning is a subset of artificial intelligence (AI) that focuses on the"} +``` + +* Send request with bad_words and stop_words + +```bash +curl -X POST localhost:8000/v2/models/ensemble/generate -d '{"text_input": "What is machine learning?", "max_tokens": 20, "bad_words": ["intelligence","model"], "stop_words": ["focuses","learn"], "pad_id": 2, "end_id": 2}' + +{"cum_log_probs":0.0,"model_name":"ensemble","model_version":"1","output_log_probs":[0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0],"sequence_end":false,"sequence_id":0,"sequence_start":false,"text_output":"\nMachine learning is a subset of artificial intelligent (AI) that focuses"} +``` + +* Send request by `inflight_batcher_llm_client.py` (Remember to add `trust_remote_code=True` in tokenizer of `inflight_batcher_llm_client.py`) + +```bash +python3 tensorrt_llm/triton_backend/inflight_batcher_llm/client/inflight_batcher_llm_client.py --request-output-len 200 --tokenizer-dir ${HF_BAICHUAN_MODEL} + +========= +Input sequence: [16448, 677, 5611, 31136, 21309, 4746, 31125, 694, 1033, 653, 8808, 754, 650] +Got completed request +Input: Born in north-east France, Soyer trained as a +Output beam 0: . He became the chef at the Reform Club, and later at the Vegetarian Restaurant, where he pioneered the use of vegetables in fine dining. He also wrote a number of books, including The London Art of Cookery (1858), The Modern Housekeeper (1861), and The Compleat Housekeeper (1862). +Soyer was a strong supporter of the British National Rifle Association, and was a member of the organisation's council. He was also a member of the Reform Club, the Athenaeum, and the Rifle Club. He died in London in 1904. +Soyer was born in the village of Montigny-lès-Cormeilles, in the department of Aisne, France. 
He was the son of a baker, and was educated in the +Output sequence: [16814, 677, 5621, 1412, 4514, 678, 2835, 677, 31106, 53, 60, 57, 59, 79, 1057, 3142, 656, 16814, 772, 656, 15824, 4305, 31125, 680, 2384, 772, 656, 9592, 1161, 8480, 13550, 807, 31125, 1238, 742, 11135, 2521, 656, 1226, 679, 8431, 3392, 677, 4816, 8946, 79, 1057, 982, 4251, 650, 1697, 679, 3594, 31125, 1516, 776, 2835, 2409, 679, 7782, 1620, 762, 53, 60, 57, 60, 1098, 776, 8753, 2542, 17655, 762, 53, 60, 58, 53, 1098, 680, 776, 1127, 1596, 658, 2542, 17655, 762, 53, 60, 58, 54, 31145, 79, 5, 31131, 1033, 653, 796, 650, 2427, 23747, 679, 656, 3681, 2024, 751, 19422, 2790, 728, 31125, 680, 796, 650, 2736, 679, 656, 1625, 4859, 31155, 31114, 7284, 79, 1057, 796, 982, 650, 2736, 679, 656, 15824, 4305, 31125, 656, 1996, 1179, 4302, 784, 31125, 680, 656, 751, 19422, 4305, 79, 1057, 4357, 677, 2835, 677, 31106, 53, 61, 52, 56, 79, 5, 31131, 1033, 653, 796, 4204, 677, 656, 6730, 679, 5136, 942, 31124, 31136, 31115, 16987, 31136, 31133, 908, 31107, 22542, 31125, 677, 656, 1664, 2049, 679, 703, 667, 1024, 31125, 4746, 79, 1057, 796, 656, 3652, 679, 650, 675, 3034, 31125, 680, 796, 18735, 677, 656] +``` + +* Run test on dataset + +``` +python3 tensorrt_llm/triton_backend/inflight_batcher_llm/end_to_end_test.py --dataset ci/L0_backend_trtllm/simple_data.json --max-input-len 500 + +[INFO] Start testing on 13 prompts. +[INFO] Functionality test succeed. +[INFO] Warm up for benchmarking. +[INFO] Start benchmarking on 13 prompts. +[INFO] Total Latency: 1598.328 ms +``` + +* Run with decoupled mode (streaming) + +```bash +cp tensorrt_llm/triton_backend/all_models/inflight_batcher_llm/ baichuan_ifb -r + +python3 tensorrt_llm/triton_backend/tools/fill_template.py -i baichuan_ifb/preprocessing/config.pbtxt tokenizer_dir:${HF_BAICHUAN_MODEL},triton_max_batch_size:64,preprocessing_instance_count:1 +python3 tensorrt_llm/triton_backend/tools/fill_template.py -i baichuan_ifb/postprocessing/config.pbtxt tokenizer_dir:${HF_BAICHUAN_MODEL},triton_max_batch_size:64,postprocessing_instance_count:1 +python3 tensorrt_llm/triton_backend/tools/fill_template.py -i baichuan_ifb/tensorrt_llm_bls/config.pbtxt triton_max_batch_size:64,decoupled_mode:True,bls_instance_count:1,accumulate_tokens:True,logits_datatype:TYPE_FP32 +python3 tensorrt_llm/triton_backend/tools/fill_template.py -i baichuan_ifb/ensemble/config.pbtxt triton_max_batch_size:64,logits_datatype:TYPE_FP32 +python3 tensorrt_llm/triton_backend/tools/fill_template.py -i baichuan_ifb/tensorrt_llm/config.pbtxt triton_backend:tensorrtllm,triton_max_batch_size:64,decoupled_mode:True,max_beam_width:1,engine_dir:/tmp/baichuan/13B/trt_engines/fp16/1-gpu/,max_tokens_in_paged_kv_cache:2560,max_attention_window_size:2560,kv_cache_free_gpu_mem_fraction:0.5,exclude_input_in_output:True,enable_kv_cache_reuse:False,batching_strategy:inflight_fused_batching,max_queue_delay_microseconds:0,encoder_input_features_data_type:TYPE_FP16,logits_datatype:TYPE_FP32 + +pip install SentencePiece +# please add `trust_remote_code=True` in tokenizer of preprocessing and postprocessing. Considering the security, we don't add it by default. +python3 tensorrt_llm/triton_backend/scripts/launch_triton_server.py --world_size 1 --model_repo=baichuan_ifb/ + +python3 tensorrt_llm/triton_backend/inflight_batcher_llm/client/inflight_batcher_llm_client.py --request-output-len 200 --tokenizer-dir ${HF_BAICHUAN_MODEL} --streaming +``` + +
+ The result would be like + + +```bash +========= +Input sequence: [16448, 677, 5611, 31136, 21309, 4746, 31125, 694, 1033, 653, 8808, 754, 650] +[16814] +[677] +[5621] +[1412] +[4514] +[678] +[2835] +[677] +[31106] +[53] +[60] +[57] +[59] +[79] +[1057] +[3142] +[656] +[16814] +[772] +[656] +[15824] +[4305] +[31125] +[680] +[2384] +[772] +[656] +[9592] +[1161] +[8480] +[13550] +[807] +[31125] +[1238] +[742] +[11135] +[2521] +[656] +[1226] +[679] +[8431] +[3392] +[677] +[4816] +[8946] +[79] +[1057] +[982] +[4251] +[650] +[1697] +[679] +[3594] +[31125] +[1516] +[776] +[2835] +[2409] +[679] +[7782] +[1620] +[762] +[53] +[60] +[57] +[60] +[1098] +[776] +[8753] +[2542] +[17655] +[762] +[53] +[60] +[58] +[53] +[1098] +[680] +[776] +[1127] +[1596] +[658] +[2542] +[17655] +[762] +[53] +[60] +[58] +[54] +[31145] +[79] +[5] +[31131] +[1033] +[653] +[796] +[650] +[2427] +[23747] +[679] +[656] +[3681] +[2024] +[751] +[19422] +[2790] +[728] +[31125] +[680] +[796] +[650] +[2736] +[679] +[656] +[1625] +[4859] +[31155] +[31114] +[7284] +[79] +[1057] +[796] +[982] +[650] +[2736] +[679] +[656] +[15824] +[4305] +[31125] +[656] +[1996] +[1179] +[4302] +[784] +[31125] +[680] +[656] +[751] +[19422] +[4305] +[79] +[1057] +[4357] +[677] +[2835] +[677] +[31106] +[53] +[61] +[52] +[56] +[79] +[5] +[31131] +[1033] +[653] +[796] +[4204] +[677] +[656] +[6730] +[679] +[5136] +[942] +[31124] +[31136] +[31115] +[16987] +[31136] +[31133] +[908] +[31107] +[22542] +[31125] +[677] +[656] +[1664] +[2049] +[679] +[703] +[667] +[1024] +[31125] +[4746] +[79] +[1057] +[796] +[656] +[3652] +[679] +[650] +[675] +[3034] +[31125] +[680] +[796] +[18735] +[677] +[656] +Input: Born in north-east France, Soyer trained as a +Output beam 0: chef in Paris before moving to London in 1857. He became the chef at the Reform Club, and later at the Vegetarian Restaurant, where he pioneered the use of vegetables in fine dining. He also wrote a number of books, including The London Art of Cookery (1858), The Modern Housekeeper (1861), and The Compleat Housekeeper (1862). +Soyer was a strong supporter of the British National Rifle Association, and was a member of the organisation's council. He was also a member of the Reform Club, the Athenaeum, and the Rifle Club. He died in London in 1904. +Soyer was born in the village of Montigny-lès-Cormeilles, in the department of Aisne, France. 
He was the son of a baker, and was educated in the +Output sequence: [16448, 677, 5611, 31136, 21309, 4746, 31125, 694, 1033, 653, 8808, 754, 650, 16814, 677, 5621, 1412, 4514, 678, 2835, 677, 31106, 53, 60, 57, 59, 79, 1057, 3142, 656, 16814, 772, 656, 15824, 4305, 31125, 680, 2384, 772, 656, 9592, 1161, 8480, 13550, 807, 31125, 1238, 742, 11135, 2521, 656, 1226, 679, 8431, 3392, 677, 4816, 8946, 79, 1057, 982, 4251, 650, 1697, 679, 3594, 31125, 1516, 776, 2835, 2409, 679, 7782, 1620, 762, 53, 60, 57, 60, 1098, 776, 8753, 2542, 17655, 762, 53, 60, 58, 53, 1098, 680, 776, 1127, 1596, 658, 2542, 17655, 762, 53, 60, 58, 54, 31145, 79, 5, 31131, 1033, 653, 796, 650, 2427, 23747, 679, 656, 3681, 2024, 751, 19422, 2790, 728, 31125, 680, 796, 650, 2736, 679, 656, 1625, 4859, 31155, 31114, 7284, 79, 1057, 796, 982, 650, 2736, 679, 656, 15824, 4305, 31125, 656, 1996, 1179, 4302, 784, 31125, 680, 656, 751, 19422, 4305, 79, 1057, 4357, 677, 2835, 677, 31106, 53, 61, 52, 56, 79, 5, 31131, 1033, 653, 796, 4204, 677, 656, 6730, 679, 5136, 942, 31124, 31136, 31115, 16987, 31136, 31133, 908, 31107, 22542, 31125, 677, 656, 1664, 2049, 679, 703, 667, 1024, 31125, 4746, 79, 1057, 796, 656, 3652, 679, 650, 675, 3034, 31125, 680, 796, 18735, 677, 656] +``` + +
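+* Query the streaming deployment over HTTP (optional)
+
+If you prefer HTTP over the gRPC client, a decoupled deployment can also be queried
+through the `generate_stream` endpoint. This is a minimal sketch reusing the request
+fields from the earlier `generate` examples; the `stream` field name and the endpoint
+path are assumptions here, so check the ensemble's config and the Triton generate
+extension docs if the request is rejected:
+
+```bash
+# Returns one server-sent event per generated chunk instead of a single response
+curl -X POST localhost:8000/v2/models/ensemble/generate_stream \
+     -d '{"text_input": "What is machine learning?", "max_tokens": 20, "bad_words": "", "stop_words": "", "pad_id": 2, "end_id": 2, "stream": true}'
+```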
+
+
+* Run several requests at the same time
+
+```bash
+echo '{"text_input": "What is machine learning?", "max_tokens": 20, "bad_words": "", "stop_words": "", "pad_id": 2, "end_id": 2}' > tmp.txt
+printf '%s\n' {1..20} | xargs -I % -P 20 curl -X POST localhost:8000/v2/models/ensemble/generate -d @tmp.txt
+```
diff --git a/docs/build.md b/docs/build.md
new file mode 100644
index 00000000..54232dfb
--- /dev/null
+++ b/docs/build.md
@@ -0,0 +1,36 @@
+# Building from Source
+
+This document describes how to build the TensorRT-LLM backend and the Triton
+TRT-LLM container from source. The Triton container includes TensorRT-LLM,
+along with the TensorRT-LLM backend and the Python backend.
+
+## Build the TensorRT-LLM Backend from source
+
+Make sure TensorRT-LLM is installed before building the backend. Since the
+versions of TensorRT-LLM and the TensorRT-LLM backend have to be aligned, it is
+recommended to directly use the Triton TRT-LLM container from NGC or build the
+whole container from source as described below in the Build the Docker Container
+section.
+
+```bash
+cd tensorrt_llm/triton_backend/inflight_batcher_llm
+bash scripts/build.sh
+```
+
+## Build the Docker Container
+
+### Build using `build.sh`
+
+```bash
+bash ../build.sh
+```
+
+### Build via Docker
+
+You can build the container using the instructions in the [TensorRT-LLM Docker Build](https://github.com/NVIDIA/TensorRT-LLM/blob/main/docker/README.md)
+with the `tritonrelease` stage.
+
+```bash
+cd tensorrt_llm/
+make -C docker tritonrelease_build
+```
diff --git a/docs/encoder_decoder.md b/docs/encoder_decoder.md
new file mode 100755
index 00000000..56af8bfe
--- /dev/null
+++ b/docs/encoder_decoder.md
@@ -0,0 +1,402 @@
+# End to end workflow to run an Encoder-Decoder model
+
+### Support Matrix
+For the specific models supported by the encoder-decoder family, please visit [TensorRT-LLM encoder-decoder examples](https://github.com/NVIDIA/TensorRT-LLM/tree/main/examples/enc_dec#encoder-decoder-model-support). The following two model types are supported:
+* T5
+* BART
+
+## Run Encoder-Decoder with Tritonserver
+### Tritonserver setup steps
+
+#### 1. Make sure that you have initialized the TRT-LLM submodule:
+
+```
+ git clone https://github.com/triton-inference-server/tensorrtllm_backend.git && cd tensorrtllm_backend
+ git lfs install
+ git submodule update --init --recursive
+```
+
+#### 2. Start the Triton Server Docker container within the `tensorrtllm_backend` repo:
+
+If you're using the [Triton TRT-LLM NGC container](https://catalog.ngc.nvidia.com/orgs/nvidia/containers/tritonserver/tags):
+
+```
+ # Replace the tag with the version of Triton you want to use. Here we use 24.08.
+ # The commands below assume that the current directory is the
+ # TRT-LLM backend root git repository.
+
+ docker run --gpus all --ipc=host --ulimit memlock=-1 --shm-size=20g -v `pwd`:/workspace -w /workspace nvcr.io/nvidia/tritonserver:24.08-trtllm-python-py3 bash
+```
+
+If you are [building your own TensorRT-LLM Backend container](https://github.com/triton-inference-server/tensorrtllm_backend#option-2-build-via-docker), you can run the `tensorrtllm_backend` container:
+
+```
+ docker run --gpus all --ipc=host --ulimit memlock=-1 --shm-size=20g -v `pwd`:/workspace -w /workspace triton_trt_llm bash
+```
+
+#### 3. Build the engines:
+
+Clone the target model repository from HuggingFace. Here we use the [T5-small model](https://huggingface.co/google-t5/t5-small) as an example, but you can also follow the same steps for a BART model.
+ + + git lfs install + git clone https://huggingface.co/google-t5/t5-small /workspace/hf_models/t5-small + + +Build TensorRT-LLM engines. + +``` + export MODEL_NAME=t5-small # or bart-base + export MODEL_TYPE=t5 # or bart + export HF_MODEL_PATH=/workspace/hf_models/${MODEL_NAME} + export UNIFIED_CKPT_PATH=/workspace/ckpt/${MODEL_NAME} + export ENGINE_PATH=/workspace/engines/${MODEL_NAME} + export INFERENCE_PRECISION=float16 + export TP_SIZE=1 + export MAX_BEAM_WIDTH=1 + export MAX_BATCH_SIZE=8 + export INPUT_LEN=1024 + export OUTPUT_LEN=201 + + python3 tensorrt_llm/examples/enc_dec/convert_checkpoint.py \ + --model_type ${MODEL_TYPE} \ + --model_dir ${HF_MODEL_PATH} \ + --output_dir ${UNIFIED_CKPT_PATH} \ + --dtype ${INFERENCE_PRECISION} \ + --tp_size ${TP_SIZE} + + trtllm-build --checkpoint_dir ${UNIFIED_CKPT_PATH}/encoder \ + --output_dir ${ENGINE_PATH}/encoder \ + --kv_cache_type disabled \ + --moe_plugin disable \ + --max_beam_width ${MAX_BEAM_WIDTH} \ + --max_input_len ${INPUT_LEN} \ + --max_batch_size ${MAX_BATCH_SIZE} \ + --gemm_plugin ${INFERENCE_PRECISION} \ + --bert_attention_plugin ${INFERENCE_PRECISION} \ + --gpt_attention_plugin ${INFERENCE_PRECISION} \ + --context_fmha disable # remove for BART + + trtllm-build --checkpoint_dir ${UNIFIED_CKPT_PATH}/decoder \ + --output_dir ${ENGINE_PATH}/decoder \ + --moe_plugin disable \ + --max_beam_width ${MAX_BEAM_WIDTH} \ + --max_batch_size ${MAX_BATCH_SIZE} \ + --gemm_plugin ${INFERENCE_PRECISION} \ + --bert_attention_plugin ${INFERENCE_PRECISION} \ + --gpt_attention_plugin ${INFERENCE_PRECISION} \ + --max_input_len 1 \ + --max_encoder_input_len ${INPUT_LEN} \ + --max_seq_len ${OUTPUT_LEN} \ + --context_fmha disable # remove for BART +``` + +> **NOTE** +> +> If you want to build multi-GPU engine using Tensor Parallelism then you can set `--tp_size` in convert_checkpoint.py. For example, for TP=2 on 2-GPU you can set `--tp_size=2`. If you want to use beam search then set `--max_beam_width` to higher value than 1. The `--max_input_len` in encoder trtllm-build controls the model input length and should be same as `--max_encoder_input_len` in decoder trtllm-build. Additionally, to control the model output len you should set `--max_seq_len` in decoder trtllm-build to `desired output length + 1`. It is also advisable to tune [`--max_num_tokens`](https://github.com/NVIDIA/TensorRT-LLM/blob/main/docs/source/performance/perf-best-practices.md#max_num_tokens) as the default value of 8192 might be too large or too small depending on your input, output len and use-cases. For BART family models, make sure to remove `--context_fmha disable` from both encoder and decoder trtllm-build commands. Please refer to [TensorRT-LLM enc-dec example](https://github.com/NVIDIA/TensorRT-LLM/tree/main/examples/enc_dec#build-tensorrt-engines) for more details. + +#### 4. 
Prepare Tritonserver configs + +``` + cp tensorrt_llm/triton_backend/all_models/inflight_batcher_llm/ enc_dec_ifb -r + + python3 tensorrt_llm/triton_backend/tools/fill_template.py -i enc_dec_ifb/tensorrt_llm/config.pbtxt triton_backend:tensorrtllm,triton_max_batch_size:${MAX_BATCH_SIZE},decoupled_mode:False,max_beam_width:${MAX_BEAM_WIDTH},engine_dir:${ENGINE_PATH}/decoder,encoder_engine_dir:${ENGINE_PATH}/encoder,kv_cache_free_gpu_mem_fraction:0.8,cross_kv_cache_fraction:0.5,exclude_input_in_output:True,enable_kv_cache_reuse:False,batching_strategy:inflight_fused_batching,max_queue_delay_microseconds:0,enable_chunked_context:False,max_queue_size:0,encoder_input_features_data_type:TYPE_FP16,logits_datatype:TYPE_FP32 + + python3 tensorrt_llm/triton_backend/tools/fill_template.py -i enc_dec_ifb/preprocessing/config.pbtxt tokenizer_dir:${HF_MODEL_PATH},triton_max_batch_size:${MAX_BATCH_SIZE},preprocessing_instance_count:1 + + python3 tensorrt_llm/triton_backend/tools/fill_template.py -i enc_dec_ifb/postprocessing/config.pbtxt tokenizer_dir:${HF_MODEL_PATH},triton_max_batch_size:${MAX_BATCH_SIZE},postprocessing_instance_count:1 + + python3 tensorrt_llm/triton_backend/tools/fill_template.py -i enc_dec_ifb/ensemble/config.pbtxt triton_max_batch_size:${MAX_BATCH_SIZE},logits_datatype:TYPE_FP32 + + python3 tensorrt_llm/triton_backend/tools/fill_template.py -i enc_dec_ifb/tensorrt_llm_bls/config.pbtxt triton_max_batch_size:${MAX_BATCH_SIZE},decoupled_mode:False,bls_instance_count:1,accumulate_tokens:False,logits_datatype:TYPE_FP32 + +``` + +> **NOTE** +> +> Currently, encoder-decoder models don't support running with chunked context. + +#### 5. Launch Tritonserver + +``` +python3 tensorrt_llm/triton_backend/scripts/launch_triton_server.py --world_size 1 --model_repo=enc_dec_ifb/ +``` + +### Send requests +#### 1. Send request with CURL + +``` +curl -X POST localhost:8000/v2/models/ensemble/generate -d "{\"text_input\": \"Summarize the following news article: (CNN)Following last year's successful U.K. tour, Prince and 3rdEyeGirl are bringing the Hit & Run Tour to the U.S. for the first time. The first -- and so far only -- scheduled show will take place in Louisville, Kentucky, the hometown of 3rdEyeGirl drummer Hannah Welton. Slated for March 14, tickets will go on sale Monday, March 9 at 10 a.m. local time. Prince crowns dual rock charts . A venue has yet to be announced. When the Hit & Run worked its way through the U.K. in 2014, concert venues were revealed via Twitter prior to each show. Portions of the ticket sales will be donated to various Louisville charities. See the original story at Billboard.com. ©2015 Billboard. All Rights Reserved.\", \"max_tokens\": 1024, \"bad_words\": \"\", \"stop_words\": \"\"}" + + {"context_logits":0.0,"cum_log_probs":0.0,"generation_logits":0.0,"model_name":"ensemble","model_version":"1","output_log_probs":0.0,"sequence_end":false,"sequence_id":0,"sequence_start":false,"text_output":":::: (CNN): (CNN): (CNN) the Hit & Run Tour to the U.S. for the first time. the Hit & Run Tour will take place in Louisville, Kentucky, the hometown of 3rdEyeGirl drummer Hannah Welton. Tickets will go on sale Monday, March 9 at 10 a.m. local time."} +``` + +#### 2. Send request with `bad_words` and `stop_words` + +After applying the `stop_words` and `bad_words`, the output avoids the bad words and stops at the first generated stop word. 
+ +``` +curl -X POST localhost:8000/v2/models/ensemble/generate -d "{\"text_input\": \"Summarize the following news article: (CNN)Following last year's successful U.K. tour, Prince and 3rdEyeGirl are bringing the Hit & Run Tour to the U.S. for the first time. The first -- and so far only -- scheduled show will take place in Louisville, Kentucky, the hometown of 3rdEyeGirl drummer Hannah Welton. Slated for March 14, tickets will go on sale Monday, March 9 at 10 a.m. local time. Prince crowns dual rock charts . A venue has yet to be announced. When the Hit & Run worked its way through the U.K. in 2014, concert venues were revealed via Twitter prior to each show. Portions of the ticket sales will be donated to various Louisville charities. See the original story at Billboard.com. ©2015 Billboard. All Rights Reserved.\", \"max_tokens\": 1024, \"bad_words\": [\"drummer\", \"hometown\"], \"stop_words\": [\"Tickets\", \"sale\"]}" + + {"context_logits":0.0,"cum_log_probs":0.0,"generation_logits":0.0,"model_name":"ensemble","model_version":"1","output_log_probs":0.0,"sequence_end":false,"sequence_id":0,"sequence_start":false,"text_output":":::: (CNN): (CNN): (CNN) the Hit & Run Tour to the U.S. for the first time. the Hit & Run Tour will take place in Louisville, Kentucky, the home of 3rdEyeGirl's Hannah Welton."} +``` + +#### 3. Send request by `inflight_batcher_llm_client.py` +If not already installed, install `tritonclient` + +``` + pip install tritonclient[all] + python3 tensorrt_llm/triton_backend/inflight_batcher_llm/client/inflight_batcher_llm_client.py --text "translate English to German: This is good" --request-output-len 200 --exclude-input-in-output --tokenizer-dir ${HF_MODEL_PATH} --beam-width ${MAX_BEAM_WIDTH} + + ======== + Using pad_id: 0 + Using end_id: 1 + Input sequence: [13959, 1566, 12, 2968, 10, 100, 19, 207, 1] + [TensorRT-LLM][WARNING] decoder_input_ids is not present in the request for encoder-decoder model. The decoder input tokens will be set to [padId] + Got completed request + Input: translate English to German: This is good + Output beam 0: Das is gut. + Output sequence: [644, 229, 1806, 5] +``` + +> **NOTE** +> +> Please ignore any exception thrown with the output. It's a known issue to be fixed. + +#### 4. Run test on dataset + +``` + python3 tensorrt_llm/triton_backend/inflight_batcher_llm/end_to_end_test.py --dataset tensorrt_llm/triton_backend/ci/L0_backend_trtllm/simple_data.json --max-input-len 500 + + [INFO] Start testing on 13 prompts. + [INFO] Functionality test succeed. + [INFO] Warm up for benchmarking. + [INFO] Start benchmarking on 13 prompts. + [INFO] Total Latency: 155.756 ms +``` + +#### 5. Run several requests at the same time + +``` +echo "{\"text_input\": \"Summarize the following news article: (CNN)Following last year's successful U.K. tour, Prince and 3rdEyeGirl are bringing the Hit & Run Tour to the U.S. for the first time. The first -- and so far only -- scheduled show will take place in Louisville, Kentucky, the hometown of 3rdEyeGirl drummer Hannah Welton. Slated for March 14, tickets will go on sale Monday, March 9 at 10 a.m. local time. Prince crowns dual rock charts . A venue has yet to be announced. When the Hit & Run worked its way through the U.K. in 2014, concert venues were revealed via Twitter prior to each show. Portions of the ticket sales will be donated to various Louisville charities. See the original story at Billboard.com. ©2015 Billboard. 
All Rights Reserved.\", \"max_tokens\": 1024, \"bad_words\": [\"drummer\", \"hometown\"], \"stop_words\": [\"Tickets\", \"sale\"]}" > tmp.txt + +printf '%s\n' {1..20} | xargs -I % -P 20 curl -X POST localhost:8000/v2/models/ensemble/generate -d @tmp.txt +``` +#### 6. Evaluating performance with Gen-AI Perf + +Gen-AI Perf is a command line tool for measuring the throughput and latency of generative AI models as served through an inference server. You can read more about installing Gen-AI Perf [here](https://docs.nvidia.com/deeplearning/triton-inference-server/user-guide/docs/perf_analyzer/genai-perf/README.html#installation). + +To use Gen-AI Perf, run the following command: + +``` +genai-perf profile \ + -m ensemble \ + --service-kind triton \ + --backend tensorrtllm \ + --num-prompts 100 \ + --random-seed 123 \ + --synthetic-input-tokens-mean 200 \ + --synthetic-input-tokens-stddev 0 \ + --output-tokens-mean 100 \ + --output-tokens-stddev 0 \ + --tokenizer ${HF_MODEL_PATH} \ + --concurrency 1 \ + --measurement-interval 4000 \ + --profile-export-file my_profile_export.json \ + --url localhost:8001 +``` + +You should expect an output that looks like this (the output below was obtained on A100-80GB with TRT-LLM v0.12): + +``` LLM Metrics +┏━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━┳━━━━━━━━┳━━━━━━━━┳━━━━━━━━┳━━━━━━━━┳━━━━━━━━┓ +┃ Statistic ┃ avg ┃ min ┃ max ┃ p99 ┃ p90 ┃ p75 ┃ +┡━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━╇━━━━━━━━╇━━━━━━━━╇━━━━━━━━╇━━━━━━━━╇━━━━━━━━┩ +│ Request latency (ms) │ 80.92 │ 78.84 │ 323.55 │ 85.14 │ 79.90 │ 79.64 │ +│ Output sequence length │ 95.83 │ 65.00 │ 100.00 │ 100.00 │ 99.00 │ 98.00 │ +│ Input sequence length │ 200.01 │ 200.00 │ 201.00 │ 200.00 │ 200.00 │ 200.00 │ +└────────────────────────┴────────┴────────┴────────┴────────┴────────┴────────┘ +Output token throughput (per sec): 1182.70 +Request throughput (per sec): 12.34 +``` + +#### 7. Run with decoupled mode (streaming) + +To enable streaming, we set `decoupled_mode:True` in config.pbtxt of `tensorrt_llm` and `tensorrt_llm_bls` model (if you are using BLS instead of ensemble). 
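+
+The `decoupled_mode` value ends up in the `model_transaction_policy` section of the
+generated `tensorrt_llm/config.pbtxt`. After running the `fill_template.py` commands
+below, you can sanity-check that the substitution took effect, for example:
+
+```bash
+# Expected to print something like: decoupled: True
+grep -A 1 "model_transaction_policy" enc_dec_ifb/tensorrt_llm/config.pbtxt
+```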
+ +``` + cp tensorrt_llm/triton_backend/all_models/inflight_batcher_llm/ enc_dec_ifb -r + + python3 tensorrt_llm/triton_backend/tools/fill_template.py -i enc_dec_ifb/tensorrt_llm/config.pbtxt triton_backend:tensorrtllm,triton_max_batch_size:${MAX_BATCH_SIZE},decoupled_mode:True,max_beam_width:${MAX_BEAM_WIDTH},engine_dir:${ENGINE_PATH}/decoder,encoder_engine_dir:${ENGINE_PATH}/encoder,kv_cache_free_gpu_mem_fraction:0.8,cross_kv_cache_fraction:0.5,exclude_input_in_output:True,enable_kv_cache_reuse:False,batching_strategy:inflight_fused_batching,max_queue_delay_microseconds:0,enable_chunked_context:False,max_queue_size:0,encoder_input_features_data_type:TYPE_FP16,logits_datatype:TYPE_FP32 + + python3 tensorrt_llm/triton_backend/tools/fill_template.py -i enc_dec_ifb/preprocessing/config.pbtxt tokenizer_dir:${HF_MODEL_PATH},triton_max_batch_size:${MAX_BATCH_SIZE},preprocessing_instance_count:1 + + python3 tensorrt_llm/triton_backend/tools/fill_template.py -i enc_dec_ifb/postprocessing/config.pbtxt tokenizer_dir:${HF_MODEL_PATH},triton_max_batch_size:${MAX_BATCH_SIZE},postprocessing_instance_count:1 + + python3 tensorrt_llm/triton_backend/tools/fill_template.py -i enc_dec_ifb/ensemble/config.pbtxt triton_max_batch_size:${MAX_BATCH_SIZE},logits_datatype:TYPE_FP32 + + python3 tensorrt_llm/triton_backend/tools/fill_template.py -i enc_dec_ifb/tensorrt_llm_bls/config.pbtxt triton_max_batch_size:${MAX_BATCH_SIZE},decoupled_mode:True,bls_instance_count:1,accumulate_tokens:False,logits_datatype:TYPE_FP32 + +``` + +We launch Tritonserver + +``` +python3 tensorrt_llm/triton_backend/scripts/launch_triton_server.py --world_size 1 --model_repo=enc_dec_ifb/ +``` + +Then send request by `inflight_batcher_llm_client.py` + +``` +pip install tritonclient[all] +python3 tensorrt_llm/triton_backend/inflight_batcher_llm/client/inflight_batcher_llm_client.py --text "translate English to German: This is good" --request-output-len 200 --exclude-input-in-output --tokenizer-dir ${HF_MODEL_PATH} --beam-width ${MAX_BEAM_WIDTH} --streaming +``` + +To use Gen-AI Perf to benchmark streaming/decoupled mode, run the following command: + +``` +genai-perf profile \ + -m ensemble \ + --service-kind triton \ + --backend tensorrtllm \ + --num-prompts 100 \ + --random-seed 123 \ + --synthetic-input-tokens-mean 200 \ + --synthetic-input-tokens-stddev 0 \ + --streaming \ + --output-tokens-mean 100 \ + --output-tokens-stddev 0 \ + --tokenizer ${HF_MODEL_PATH} \ + --concurrency 1 \ + --measurement-interval 4000 \ + --profile-export-file my_profile_export.json \ + --url localhost:8001 +``` + +You should see output like this (the output below was obtained on A100-80GB with TRT-LLM v0.12) + +``` + LLM Metrics +┏━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━┳━━━━━━━━┳━━━━━━━━┳━━━━━━━━┳━━━━━━━━┳━━━━━━━━┓ +┃ Statistic ┃ avg ┃ min ┃ max ┃ p99 ┃ p90 ┃ p75 ┃ +┡━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━╇━━━━━━━━╇━━━━━━━━╇━━━━━━━━╇━━━━━━━━╇━━━━━━━━┩ +│ Time to first token (ms) │ 4.69 │ 3.99 │ 14.05 │ 5.70 │ 5.04 │ 4.76 │ +│ Inter token latency (ms) │ 0.63 │ 0.38 │ 1.04 │ 0.98 │ 0.70 │ 0.66 │ +│ Request latency (ms) │ 75.32 │ 46.34 │ 114.27 │ 90.35 │ 79.27 │ 79.11 │ +│ Output sequence length │ 116.50 │ 58.00 │ 197.00 │ 197.00 │ 132.00 │ 128.00 │ +│ Input sequence length │ 200.01 │ 200.00 │ 201.00 │ 200.10 │ 200.00 │ 200.00 │ +└──────────────────────────┴────────┴────────┴────────┴────────┴────────┴────────┘ +Output token throughput (per sec): 1542.81 +Request throughput (per sec): 13.24 +``` + +## Running multiple instances of encoder-decoder model on multiple GPUs + 
+In this section, we demonstrate how you can use
+[Leader Mode](../README.md#leader-mode) for running multiple instances of an
+encoder-decoder model on different GPUs.
+
+For this section, let's assume that we have four GPUs and the CUDA device ids
+are 0, 1, 2, and 3. We will be launching two instances of the T5-small model
+with tensor parallelism 2 (TP=2). The first instance will run on GPUs 0 and 1
+and the second instance will run on GPUs 2 and 3. We will launch two separate
+`mpirun` commands, one per model instance; each one starts one Triton Server
+process per GPU, so there are four Triton Server processes in total. We also
+need to use a reverse proxy in front of them to load balance the requests
+between the two servers.
+
+[Orchestrator Mode](../README.md#orchestrator-mode) is currently not supported.
+
+
+### Triton setup steps
+1. Build the model, but add `--tp_size 2` when converting checkpoints. The rest
+   of the steps are the same as the [Tritonserver setup steps](#tritonserver-setup-steps).
+
+```
+ export MODEL_NAME=t5-small
+ export MODEL_TYPE=t5 # or bart
+ export HF_MODEL_PATH=/workspace/hf_models/${MODEL_NAME}
+ export UNIFIED_CKPT_PATH=/workspace/ckpt/${MODEL_NAME}-2tp-2gpu
+ export ENGINE_PATH=/workspace/engines/${MODEL_NAME}-2tp-2gpu
+
+ python tensorrt_llm/examples/enc_dec/convert_checkpoint.py \
+        --model_type ${MODEL_TYPE} \
+        --model_dir ${HF_MODEL_PATH} \
+        --output_dir ${UNIFIED_CKPT_PATH} \
+        --dtype float16 \
+        --tp_size 2
+
+ trtllm-build --checkpoint_dir ${UNIFIED_CKPT_PATH}/encoder \
+              --output_dir ${ENGINE_PATH}/encoder \
+              --kv_cache_type disabled \
+              --moe_plugin disable \
+              --max_batch_size 64 \
+              --gemm_plugin float16 \
+              --bert_attention_plugin float16 \
+              --gpt_attention_plugin float16 \
+              --max_input_len 2048 \
+              --context_fmha disable
+
+ trtllm-build --checkpoint_dir ${UNIFIED_CKPT_PATH}/decoder \
+              --output_dir ${ENGINE_PATH}/decoder \
+              --moe_plugin disable \
+              --max_batch_size 64 \
+              --gemm_plugin float16 \
+              --bert_attention_plugin float16 \
+              --gpt_attention_plugin float16 \
+              --context_fmha disable \
+              --max_input_len 1 \
+              --max_encoder_input_len 2048
+```
+
+2. Setup the Tritonserver configs with the same commands as in [step 4](#prepare-tritonserver-configs) above.
+
+3. Launch the servers:
+
+```
+ CUDA_VISIBLE_DEVICES=0,1 python3 tensorrt_llm/triton_backend/scripts/launch_triton_server.py --world_size 2 --model_repo=enc_dec_ifb/ --http_port 8000 --grpc_port 8001 --metrics_port 8004
+ CUDA_VISIBLE_DEVICES=2,3 python3 tensorrt_llm/triton_backend/scripts/launch_triton_server.py --world_size 2 --model_repo=enc_dec_ifb/ --http_port 8002 --grpc_port 8003 --metrics_port 8005
+```
+
+4. Install NGINX:
+
+```
+ apt update
+ apt install nginx -y
+```
+
+5. Setup the NGINX configuration and store it in `/etc/nginx/sites-available/tritonserver`:
+
+```
+ upstream tritonserver {
+     server localhost:8000;
+     server localhost:8002;
+ }
+
+ server {
+     listen 8080;
+
+     location / {
+         proxy_pass http://tritonserver;
+     }
+ }
+```
+
+6. Create a symlink and restart NGINX to enable the configuration:
+
+```
+ ln -s /etc/nginx/sites-available/tritonserver /etc/nginx/sites-enabled/tritonserver
+ service nginx restart
+```
+
+### Send the request
+
+1. 
Run test on dataset + +``` + # Test the load on all the servers + python3 tensorrt_llm/triton_backend/inflight_batcher_llm/end_to_end_test.py --dataset tensorrt_llm/triton_backend/ci/L0_backend_trtllm/simple_data.json --max-input-len 500 -u localhost:8080 + + # Test the load on one of the servers + python3 tensorrt_llm/triton_backend/inflight_batcher_llm/end_to_end_test.py --dataset tensorrt_llm/triton_backend/ci/L0_backend_trtllm/simple_data.json --max-input-len 500 -u localhost:8000 +``` + +### Kill the server +``` +pgrep mpirun | xargs kill +``` diff --git a/docs/gemma.md b/docs/gemma.md new file mode 100644 index 00000000..f8959ec6 --- /dev/null +++ b/docs/gemma.md @@ -0,0 +1,50 @@ +## End to end workflow to run sp model + +* Build engine + +assume tokenizer model is put in `/tmp/gemma/tmp_vocab.model` and the engine is put in `/tmp/gemma/2B/bf16/1-gpu/`. + +```bash +TOKENIZER_DIR=/tmp/models/gemma_nv/checkpoints/tmp_vocab.model +ENGINE_PATH=/tmp/gemma/2B/bf16/1-gpu/ +``` + +* Prepare configs + +Note that we use `tokenizer_type=sp` (sentencepiece) tokenizer. + +```bash +cp tensorrt_llm/triton_backend/all_models/inflight_batcher_llm/ gemma -r + +python3 tensorrt_llm/triton_backend/tools/fill_template.py -i gemma/preprocessing/config.pbtxt tokenizer_dir:${TOKENIZER_DIR},tokenizer_type:sp,triton_max_batch_size:64,preprocessing_instance_count:1,add_special_tokens:True +python3 tensorrt_llm/triton_backend/tools/fill_template.py -i gemma/postprocessing/config.pbtxt tokenizer_dir:${TOKENIZER_DIR},tokenizer_type:sp,triton_max_batch_size:64,postprocessing_instance_count:1 +python3 tensorrt_llm/triton_backend/tools/fill_template.py -i gemma/tensorrt_llm_bls/config.pbtxt triton_max_batch_size:64,decoupled_mode:False,bls_instance_count:1,accumulate_tokens:False,logits_datatype:TYPE_FP32 +python3 tensorrt_llm/triton_backend/tools/fill_template.py -i gemma/ensemble/config.pbtxt triton_max_batch_size:64,logits_datatype:TYPE_FP32 +python3 tensorrt_llm/triton_backend/tools/fill_template.py -i gemma/tensorrt_llm/config.pbtxt triton_backend:tensorrtllm,triton_max_batch_size:64,decoupled_mode:False,max_beam_width:1,engine_dir:${ENGINE_PATH},max_tokens_in_paged_kv_cache:2560,max_attention_window_size:2560,kv_cache_free_gpu_mem_fraction:0.5,exclude_input_in_output:True,enable_kv_cache_reuse:False,batching_strategy:inflight_fused_batching,max_queue_delay_microseconds:0,batch_scheduler_policy:guaranteed_no_evict,encoder_input_features_data_type:TYPE_FP16,logits_datatype:TYPE_FP32 + +``` + +* Launch server + +```bash +python3 tensorrt_llm/triton_backend/scripts/launch_triton_server.py --world_size 1 --model_repo=gemma/ +``` + + +* Send request + +```bash +curl -X POST localhost:8000/v2/models/ensemble/generate -d '{"text_input": "What is machine learning?", "max_tokens": 20, "bad_words": "", "stop_words": ""}' + +{"context_logits":0.0,"cum_log_probs":0.0,"generation_logits":0.0,"model_name":"ensemble","model_version":"1","output_log_probs":[0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0],"sequence_end":false,"sequence_id":0,"sequence_start":false,"text_output":"\n\nMachine learning is a branch of artificial intelligence that allows computers to learn from data without being explicitly programmed"} +``` + +* Send request with bad_words and stop_words + +```bash +curl -X POST localhost:8000/v2/models/ensemble/generate -d '{"text_input": "What is machine learning?", "max_tokens": 20, "bad_words": [" intelligence", " allows"], "stop_words": [" computers", "learn"]}' + 
+
+{"context_logits":0.0,"cum_log_probs":0.0,"generation_logits":0.0,"model_name":"ensemble","model_version":"1","output_log_probs":[0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0],"sequence_end":false,"sequence_id":0,"sequence_start":false,"text_output":"\n\nMachine learning is a branch of artificial intelligent that enables computers"}
+```
+
+The words ` intelligence` and ` allows` are replaced by ` intelligent` and ` enables`, and the generation stops when ` computers` is generated.
diff --git a/docs/guided_decoding.md b/docs/guided_decoding.md
new file mode 100644
index 00000000..473dcaba
--- /dev/null
+++ b/docs/guided_decoding.md
@@ -0,0 +1,128 @@
+# End-to-End Workflow for Guided Decoding with TensorRT-LLM Backend
+
+This document outlines the process for running guided decoding using the TensorRT-LLM backend. Guided decoding ensures that generated outputs adhere to specified formats, such as JSON. Currently, this feature is supported through the [XGrammar](https://github.com/mlc-ai/xgrammar) backend.
+
+For more information, refer to the [guided decoding documentation](https://github.com/NVIDIA/TensorRT-LLM/blob/main/docs/source/advanced/executor.md#structured-output-with-guided-decoding) from TensorRT-LLM. Additionally, you can explore an example of [guided decoding with the LLM API](https://nvidia.github.io/TensorRT-LLM/llm-api-examples/llm_guided_decoding.html).
+
+## Overview of Guided Decoding
+Guided decoding ensures that generated outputs conform to specific constraints or formats. Supported guide types include:
+- **None**: No constraints.
+- **JSON**: Outputs in JSON format.
+- **JSON Schema**: JSON format with schema validation.
+- **Regex**: Outputs matching a regular expression.
+- **EBNF Grammar**: Outputs adhering to extended Backus-Naur form (EBNF) grammar rules.
+
+# Build TensorRT-LLM engine and launch Tritonserver
+
+From this point, we assume you have installed all the requirements for tensorrtllm_backend. You can refer to [build.md](build.md) for installation and Docker launch.
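+
+If you are following the container-based workflow used by the other documents in this directory, the remaining steps assume you are already inside a Triton TRT-LLM container. A minimal launch sketch is shown below; the image tag `<xx.yy>` is a placeholder and the mount paths are assumptions that you should adapt to your environment.
+
+```bash
+# Sketch: start an interactive Triton TRT-LLM container (adjust the tag and mounts).
+# `pwd` is assumed to be the tensorrtllm_backend root so the repo is visible at /mnt.
+docker run --rm -ti --gpus all \
+    -v `pwd`:/mnt -w /mnt \
+    -v ~/.cache/huggingface:/root/.cache/huggingface \
+    nvcr.io/nvidia/tritonserver:<xx.yy>-trtllm-python-py3 bash
+```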
+
+## Build TensorRT-LLM engine
+```bash
+# Clone model from Hugging Face
+export MODEL_NAME=TinyLlama-1.1B-Chat-v1.0
+git clone https://huggingface.co/TinyLlama/TinyLlama-1.1B-Chat-v1.0 hf_models/${MODEL_NAME}
+
+export HF_MODEL_PATH=hf_models/${MODEL_NAME}
+export UNIFIED_CKPT_PATH=trt_ckpts/tiny_llama_1b/1-gpu/fp16
+export ENGINE_PATH=trt_engines/tiny_llama_1b/1-gpu/fp16
+
+python tensorrt_llm/examples/llama/convert_checkpoint.py --model_dir ${HF_MODEL_PATH} \
+                                                         --output_dir ${UNIFIED_CKPT_PATH} \
+                                                         --dtype float16
+
+trtllm-build --checkpoint_dir ${UNIFIED_CKPT_PATH} \
+             --remove_input_padding enable \
+             --gpt_attention_plugin float16 \
+             --context_fmha enable \
+             --gemm_plugin float16 \
+             --output_dir ${ENGINE_PATH} \
+             --kv_cache_type paged \
+             --max_batch_size 64
+```
+## Launch Tritonserver
+
+## Python Backend
+```bash
+export GUIDED_DECODING_BACKEND=xgrammar
+export TRITON_BACKEND=python
+
+cp tensorrt_llm/triton_backend/all_models/inflight_batcher_llm/ llama_ifb -r
+
+python3 tensorrt_llm/triton_backend/tools/fill_template.py -i llama_ifb/preprocessing/config.pbtxt tokenizer_dir:${HF_MODEL_PATH},triton_max_batch_size:64,preprocessing_instance_count:1
+python3 tensorrt_llm/triton_backend/tools/fill_template.py -i llama_ifb/postprocessing/config.pbtxt tokenizer_dir:${HF_MODEL_PATH},triton_max_batch_size:64,postprocessing_instance_count:1
+python3 tensorrt_llm/triton_backend/tools/fill_template.py -i llama_ifb/tensorrt_llm_bls/config.pbtxt triton_max_batch_size:64,decoupled_mode:False,bls_instance_count:1,accumulate_tokens:False,logits_datatype:TYPE_FP32
+python3 tensorrt_llm/triton_backend/tools/fill_template.py -i llama_ifb/ensemble/config.pbtxt triton_max_batch_size:64,logits_datatype:TYPE_FP32
+python3 tensorrt_llm/triton_backend/tools/fill_template.py -i llama_ifb/tensorrt_llm/config.pbtxt triton_backend:${TRITON_BACKEND},triton_max_batch_size:64,decoupled_mode:True,max_beam_width:1,engine_dir:${ENGINE_PATH},kv_cache_free_gpu_mem_fraction:0.5,exclude_input_in_output:True,enable_kv_cache_reuse:False,batching_strategy:inflight_fused_batching,max_queue_delay_microseconds:0,encoder_input_features_data_type:TYPE_FP16,logits_datatype:TYPE_FP32,tokenizer_dir:${HF_MODEL_PATH},guided_decoding_backend:${GUIDED_DECODING_BACKEND}
+
+python3 tensorrt_llm/triton_backend/scripts/launch_triton_server.py --world_size 1 --model_repo=llama_ifb/
+```
+
+## C++ Backend
+To run with `TRITON_BACKEND=tensorrtllm`, i.e. on the C++ backend, you need an extra step to extract the tokenizer's information into JSON format. `generate_xgrammar_tokenizer_info.py` will create `xgrammar_tokenizer_info.json` under the given `output_dir` argument. We then fill the `xgrammar_tokenizer_info_path` parameter in `tensorrt_llm/config.pbtxt`.
+```bash +export XGRAMMAR_TOKENIZER_INFO_DIR=tokenizer_info/${MODEL_NAME} + +python3 tensorrt_llm/examples/generate_xgrammar_tokenizer_info.py --model_dir ${HF_MODEL_PATH} --output_dir ${XGRAMMAR_TOKENIZER_INFO_DIR} + +export XGRAMMAR_TOKENIZER_INFO_PATH=tokenizer_info/${MODEL_NAME}/xgrammar_tokenizer_info.json +export GUIDED_DECODING_BACKEND=xgrammar +export TRITON_BACKEND=tensorrtllm + +cp tensorrt_llm/triton_backend/all_models/inflight_batcher_llm/ llama_ifb -r + +python3 tensorrt_llm/triton_backend/tools/fill_template.py -i llama_ifb/preprocessing/config.pbtxt tokenizer_dir:${HF_MODEL_PATH},triton_max_batch_size:64,preprocessing_instance_count:1 +python3 tensorrt_llm/triton_backend/tools/fill_template.py -i llama_ifb/postprocessing/config.pbtxt tokenizer_dir:${HF_MODEL_PATH},triton_max_batch_size:64,postprocessing_instance_count:1 +python3 tensorrt_llm/triton_backend/tools/fill_template.py -i llama_ifb/tensorrt_llm_bls/config.pbtxt triton_max_batch_size:64,decoupled_mode:False,bls_instance_count:1,accumulate_tokens:False,logits_datatype:TYPE_FP32 +python3 tensorrt_llm/triton_backend/tools/fill_template.py -i llama_ifb/ensemble/config.pbtxt triton_max_batch_size:64,logits_datatype:TYPE_FP32 +python3 tensorrt_llm/triton_backend/tools/fill_template.py -i llama_ifb/tensorrt_llm/config.pbtxt triton_backend:${TRITON_BACKEND},triton_max_batch_size:64,decoupled_mode:True,max_beam_width:1,engine_dir:${ENGINE_PATH},kv_cache_free_gpu_mem_fraction:0.5,exclude_input_in_output:True,enable_kv_cache_reuse:False,batching_strategy:inflight_fused_batching,max_queue_delay_microseconds:0,encoder_input_features_data_type:TYPE_FP16,logits_datatype:TYPE_FP32,guided_decoding_backend:${GUIDED_DECODING_BACKEND},xgrammar_tokenizer_info_path:${XGRAMMAR_TOKENIZER_INFO_PATH} + +python3 tensorrt_llm/triton_backend/scripts/launch_triton_server.py --world_size 1 --model_repo=llama_ifb/ +``` +# Sending Guided Decoding Requests + +Use the provided gRPC client to send requests with different guide types. +```bash +# Set the prompt +PROMPT="What is the year after 2024? Answer:" + +# 0. Guide type: None +python3 tensorrt_llm/triton_backend/inflight_batcher_llm/client/end_to_end_grpc_client.py -p "${PROMPT}" -o 30 --exclude-input-in-output --verbose --model-name ensemble + +# Output: +# 0: 2025 +# +# Question 3: What is the year after 2025? Answer: 2026 +# + +# 1. Guide type: json +python3 tensorrt_llm/triton_backend/inflight_batcher_llm/client/end_to_end_grpc_client.py -p "${PROMPT}" -o 30 --exclude-input-in-output --verbose --model-name ensemble --guided-decoding-guide-type json + +# Output: +# 0: [2025] + +# 2. Guide type: json_schema +python3 tensorrt_llm/triton_backend/inflight_batcher_llm/client/end_to_end_grpc_client.py -p "${PROMPT}" -o 30 --exclude-input-in-output --verbose --model-name ensemble --guided-decoding-guide-type json_schema --guided-decoding-guide '{"properties": {"answer": {"title": "Answer", "type": "integer"}}, "required": ["answer"], "title": "Answer", "type": "object"}' + +# Output: +# 0: {"answer": 2026} + +# 3. Guide type: regex +python3 tensorrt_llm/triton_backend/inflight_batcher_llm/client/end_to_end_grpc_client.py -p "${PROMPT}" -o 30 --exclude-input-in-output --verbose --model-name ensemble --guided-decoding-guide-type regex --guided-decoding-guide '\d+' + +# Output: +# 0: 2025 + +# 4. 
Guide type: ebnf_grammar
+python3 tensorrt_llm/triton_backend/inflight_batcher_llm/client/end_to_end_grpc_client.py -p "${PROMPT}" -o 30 --exclude-input-in-output --verbose --model-name ensemble --guided-decoding-guide-type ebnf_grammar --guided-decoding-guide 'root ::= [0-9]+'
+
+# Output:
+# 0: 2025
+```
+
+Use the curl method to send requests
+```bash
+curl -X POST localhost:8000/v2/models/ensemble/generate -d '{"text_input": "What is the year after 2024? Answer:", "max_tokens": 20, "bad_words": "", "stop_words": "", "pad_id": 2, "end_id": 2, "guided_decoding_guide_type":"json"}'
+
+# Output:
+# {"model_name":"ensemble","model_version":"1","sequence_end":false,"sequence_id":0,"sequence_start":false,"text_output":"[2025]"}
+```
diff --git a/docs/llama.md b/docs/llama.md
new file mode 100644
index 00000000..4187274b
--- /dev/null
+++ b/docs/llama.md
@@ -0,0 +1,380 @@
+## End to end workflow to run llama 7b
+
+0. Make sure that you have initialized the TRT-LLM submodule:
+
+```bash
+git lfs install
+git submodule update --init --recursive
+```
+
+1. (Optional) Download the LLaMa model from HuggingFace:
+
+```bash
+huggingface-cli login
+
+huggingface-cli download meta-llama/Llama-2-7b-hf
+```
+
+> **NOTE**
+>
+> Make sure that you have access to https://huggingface.co/meta-llama/Llama-2-7b-hf.
+
+2. Start the Triton Server Docker container:
+
+```bash
+# Replace <xx.yy> with the version of Triton you want to use.
+# The command below assumes that the current directory is the
+# TRT-LLM backend root git repository.
+
+docker run --rm -ti -v `pwd`:/mnt -w /mnt -v ~/.cache/huggingface:~/.cache/huggingface --gpus all nvcr.io/nvidia/tritonserver:<xx.yy>-trtllm-python-py3 bash
+```
+
+3. Build the engine:
+```bash
+# Replace 'HF_LLAMA_MODEL' with another path if you didn't download the model from step 1
+# or you're not using HuggingFace.
+export HF_LLAMA_MODEL=`python3 -c "from pathlib import Path; from huggingface_hub import hf_hub_download; print(Path(hf_hub_download('meta-llama/Llama-2-7b-hf', filename='config.json')).parent)"` +export UNIFIED_CKPT_PATH=/tmp/ckpt/llama/7b/ +export ENGINE_PATH=/tmp/engines/llama/7b/ +python tensorrt_llm/examples/llama/convert_checkpoint.py --model_dir ${HF_LLAMA_MODEL} \ + --output_dir ${UNIFIED_CKPT_PATH} \ + --dtype float16 + +trtllm-build --checkpoint_dir ${UNIFIED_CKPT_PATH} \ + --remove_input_padding enable \ + --gpt_attention_plugin float16 \ + --context_fmha enable \ + --gemm_plugin float16 \ + --output_dir ${ENGINE_PATH} \ + --kv_cache_type paged \ + --max_batch_size 64 +``` + +* Prepare configs + +```bash +cp tensorrt_llm/triton_backend/ci/all_models/inflight_batcher_llm/ llama_ifb -r + +python3 tensorrt_llm/triton_backend/tools/fill_template.py -i llama_ifb/preprocessing/config.pbtxt tokenizer_dir:${HF_LLAMA_MODEL},triton_max_batch_size:64,preprocessing_instance_count:1 +python3 tensorrt_llm/triton_backend/tools/fill_template.py -i llama_ifb/postprocessing/config.pbtxt tokenizer_dir:${HF_LLAMA_MODEL},triton_max_batch_size:64,postprocessing_instance_count:1 +python3 tensorrt_llm/triton_backend/tools/fill_template.py -i llama_ifb/tensorrt_llm_bls/config.pbtxt triton_max_batch_size:64,decoupled_mode:False,bls_instance_count:1,accumulate_tokens:False,logits_datatype:TYPE_FP32 +python3 tensorrt_llm/triton_backend/tools/fill_template.py -i llama_ifb/ensemble/config.pbtxt triton_max_batch_size:64,logits_datatype:TYPE_FP32 +python3 tensorrt_llm/triton_backend/tools/fill_template.py -i llama_ifb/tensorrt_llm/config.pbtxt triton_backend:tensorrtllm,triton_max_batch_size:64,decoupled_mode:False,max_beam_width:1,engine_dir:${ENGINE_PATH},max_tokens_in_paged_kv_cache:2560,max_attention_window_size:2560,kv_cache_free_gpu_mem_fraction:0.5,exclude_input_in_output:True,enable_kv_cache_reuse:False,batching_strategy:inflight_fused_batching,max_queue_delay_microseconds:0,encoder_input_features_data_type:TYPE_FP16,logits_datatype:TYPE_FP32 +``` + +* Launch server + +```bash +pip install SentencePiece +python3 tensorrt_llm/triton_backend/scripts/launch_triton_server.py --world_size 1 --model_repo=llama_ifb/ +``` + +this setting requires about 25GB + +```bash +nvidia-smi + +Wed Nov 29 08:51:30 2023 ++---------------------------------------------------------------------------------------+ +| NVIDIA-SMI 535.129.03 Driver Version: 535.129.03 CUDA Version: 12.2 | +|-----------------------------------------+----------------------+----------------------+ +| GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC | +| Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. | +| | | MIG M. 
| +|=========================================+======================+======================| +| 0 NVIDIA H100 PCIe On | 00000000:41:00.0 Off | 0 | +| N/A 40C P0 79W / 350W | 25169MiB / 81559MiB | 0% Default | +| | | Disabled | ++-----------------------------------------+----------------------+----------------------+ + ++---------------------------------------------------------------------------------------+ +| Processes: | +| GPU GI CI PID Type Process name GPU Memory | +| ID ID Usage | +|=======================================================================================| ++---------------------------------------------------------------------------------------+ +``` + +* Send request + +```bash +curl -X POST localhost:8000/v2/models/ensemble/generate -d '{"text_input": "What is machine learning?", "max_tokens": 20, "bad_words": "", "stop_words": "", "pad_id": 2, "end_id": 2}' + +{"cum_log_probs":0.0,"model_name":"ensemble","model_version":"1","output_log_probs":[0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0],"sequence_end":false,"sequence_id":0,"sequence_start":false,"text_output":"\nMachine learning is a subset of artificial intelligence (AI) that uses algorithms to learn from data and"} +``` + +* Send request with bad_words and stop_words + +```bash +curl -X POST localhost:8000/v2/models/ensemble/generate -d '{"text_input": "What is machine learning?", "max_tokens": 20, "bad_words": ["intelligence", "model"], "stop_words": ["focuses", "learn"], "pad_id": 2, "end_id": 2}' + +{"cum_log_probs":0.0,"model_name":"ensemble","model_version":"1","output_log_probs":[0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0],"sequence_end":false,"sequence_id":0,"sequence_start":false,"text_output":"\nMachine learning is a subset of artificial Intelligence (AI) that allows computers to learn"} +``` + +* Send request by `inflight_batcher_llm_client.py` + +```bash +python3 tensorrt_llm/triton_backend/tools/inflight_batcher_llm/inflight_batcher_llm_client.py --request-output-len 200 --tokenizer-dir ${HF_LLAMA_MODEL} + +========= +[[1, 19298, 297, 6641, 29899, 23027, 3444, 29892, 1105, 7598, 16370, 408, 263]] +Got completed request +Input: Born in north-east France, Soyer trained as a +Output beam 0: 850. He was the first chef to be hired by the newly opened Delmonico’s restaurant, where he worked for 10 years. He then opened his own restaurant, which was a huge success. 
+Soyer was a prolific writer and his books include The Gastronomic Regenerator (1854), The Gastronomic Regenerator and Cookery for the People (1855), The Cuisine of To-day (1859), The Cuisine of To-morrow (1864), The Cuisine of the Future (1867), The Cuisine of the Future (1873), The Cuisine of the Future (1874), The Cuisine of the Future (1875), The Cuisine of the Future (1876), The +output_ids = [14547, 297, 3681, 322, 4517, 1434, 8401, 304, 1570, 3088, 297, 29871, 29896, 29947, 29945, 29900, 29889, 940, 471, 278, 937, 14547, 304, 367, 298, 2859, 491, 278, 15141, 6496, 5556, 3712, 1417, 30010, 29879, 27144, 29892, 988, 540, 3796, 363, 29871, 29896, 29900, 2440, 29889, 940, 769, 6496, 670, 1914, 27144, 29892, 607, 471, 263, 12176, 2551, 29889, 13, 6295, 7598, 471, 263, 410, 29880, 928, 9227, 322, 670, 8277, 3160, 450, 402, 7614, 4917, 293, 2169, 759, 1061, 313, 29896, 29947, 29945, 29946, 511, 450, 402, 7614, 4917, 293, 2169, 759, 1061, 322, 17278, 708, 363, 278, 11647, 313, 29896, 29947, 29945, 29945, 511, 450, 315, 4664, 457, 310, 1763, 29899, 3250, 313, 29896, 29947, 29945, 29929, 511, 450, 315, 4664, 457, 310, 1763, 29899, 26122, 313, 29896, 29947, 29953, 29946, 511, 450, 315, 4664, 457, 310, 278, 16367, 313, 29896, 29947, 29953, 29955, 511, 450, 315, 4664, 457, 310, 278, 16367, 313, 29896, 29947, 29955, 29941, 511, 450, 315, 4664, 457, 310, 278, 16367, 313, 29896, 29947, 29955, 29946, 511, 450, 315, 4664, 457, 310, 278, 16367, 313, 29896, 29947, 29955, 29945, 511, 450, 315, 4664, 457, 310, 278, 16367, 313, 29896, 29947, 29955, 29953, 511, 450] +``` + +* Run test on dataset + +``` +python3 tensorrt_llm/triton_backend/tools/inflight_batcher_llm/end_to_end_test.py --dataset tensorrt_llm/triton_backend/ci/L0_backend_trtllm/simple_data.json --max-input-len 500 + +[INFO] Start testing on 13 prompts. +[INFO] Functionality test succeed. +[INFO] Warm up for benchmarking. +[INFO] Start benchmarking on 13 prompts. 
+[INFO] Total Latency: 962.179 ms
+```
+
+
+
+* Run with decoupled mode (streaming)
+
+```bash
+cp tensorrt_llm/triton_backend/ci/all_models/inflight_batcher_llm/ llama_ifb -r
+
+python3 tensorrt_llm/triton_backend/tools/fill_template.py -i llama_ifb/preprocessing/config.pbtxt tokenizer_dir:${HF_LLAMA_MODEL},triton_max_batch_size:64,preprocessing_instance_count:1
+python3 tensorrt_llm/triton_backend/tools/fill_template.py -i llama_ifb/postprocessing/config.pbtxt tokenizer_dir:${HF_LLAMA_MODEL},triton_max_batch_size:64,postprocessing_instance_count:1
+python3 tensorrt_llm/triton_backend/tools/fill_template.py -i llama_ifb/tensorrt_llm_bls/config.pbtxt triton_max_batch_size:64,decoupled_mode:True,bls_instance_count:1,accumulate_tokens:True,logits_datatype:TYPE_FP32
+python3 tensorrt_llm/triton_backend/tools/fill_template.py -i llama_ifb/ensemble/config.pbtxt triton_max_batch_size:64,logits_datatype:TYPE_FP32
+python3 tensorrt_llm/triton_backend/tools/fill_template.py -i llama_ifb/tensorrt_llm/config.pbtxt triton_backend:tensorrtllm,triton_max_batch_size:64,decoupled_mode:True,max_beam_width:1,engine_dir:${ENGINE_PATH},max_tokens_in_paged_kv_cache:2560,max_attention_window_size:2560,kv_cache_free_gpu_mem_fraction:0.5,exclude_input_in_output:True,enable_kv_cache_reuse:False,batching_strategy:inflight_batching,max_queue_delay_microseconds:0,encoder_input_features_data_type:TYPE_FP16,logits_datatype:TYPE_FP32
+
+pip install SentencePiece
+python3 tensorrt_llm/triton_backend/scripts/launch_triton_server.py --world_size 1 --model_repo=llama_ifb/
+
+python3 tensorrt_llm/triton_backend/tools/inflight_batcher_llm/inflight_batcher_llm_client.py --request-output-len 200 --tokenizer-dir ${HF_LLAMA_MODEL} --streaming
+```
+
+
+ The result would be like + + +```bash +========= +Input sequence: [1, 19298, 297, 6641, 29899, 23027, 3444, 29892, 1105, 7598, 16370, 408, 263] +[14547] +[297] +[3681] +[322] +[4517] +[1434] +[8401] +[304] +[1570] +[3088] +[297] +[29871] +[29896] +[29947] +[29945] +[29900] +[29889] +[940] +[471] +[278] +[937] +[14547] +[304] +[367] +[298] +[2859] +[491] +[278] +[15141] +[6496] +[5556] +[3712] +[1417] +[30010] +[29879] +[27144] +[29892] +[988] +[540] +[3796] +[363] +[29871] +[29896] +[29900] +[2440] +[29889] +[940] +[769] +[6496] +[670] +[1914] +[27144] +[29892] +[607] +[471] +[263] +[12176] +[2551] +[29889] +[13] +[6295] +[7598] +[471] +[263] +[410] +[29880] +[928] +[9227] +[322] +[670] +[8277] +[3160] +[450] +[402] +[7614] +[4917] +[293] +[2169] +[759] +[1061] +[313] +[29896] +[29947] +[29945] +[29946] +[511] +[450] +[402] +[7614] +[4917] +[293] +[2169] +[759] +[1061] +[322] +[17278] +[708] +[363] +[278] +[11647] +[313] +[29896] +[29947] +[29945] +[29945] +[511] +[450] +[315] +[4664] +[457] +[310] +[1763] +[29899] +[3250] +[313] +[29896] +[29947] +[29945] +[29929] +[511] +[450] +[315] +[4664] +[457] +[310] +[1763] +[29899] +[26122] +[313] +[29896] +[29947] +[29953] +[29946] +[511] +[450] +[315] +[4664] +[457] +[310] +[278] +[16367] +[313] +[29896] +[29947] +[29953] +[29955] +[511] +[450] +[315] +[4664] +[457] +[310] +[278] +[16367] +[313] +[29896] +[29947] +[29955] +[29941] +[511] +[450] +[315] +[4664] +[457] +[310] +[278] +[16367] +[313] +[29896] +[29947] +[29955] +[29946] +[511] +[450] +[315] +[4664] +[457] +[310] +[278] +[16367] +[313] +[29896] +[29947] +[29955] +[29945] +[511] +[450] +[315] +[4664] +[457] +[310] +[278] +[16367] +[313] +[29896] +[29947] +[29955] +[29953] +[511] +[450] +Input: Born in north-east France, Soyer trained as a +Output beam 0: chef in Paris and London before moving to New York in 1850. He was the first chef to be hired by the newly opened Delmonico’s restaurant, where he worked for 10 years. He then opened his own restaurant, which was a huge success. 
+Soyer was a prolific writer and his books include The Gastronomic Regenerator (1854), The Gastronomic Regenerator and Cookery for the People (1855), The Cuisine of To-day (1859), The Cuisine of To-morrow (1864), The Cuisine of the Future (1867), The Cuisine of the Future (1873), The Cuisine of the Future (1874), The Cuisine of the Future (1875), The Cuisine of the Future (1876), The +Output sequence: [1, 19298, 297, 6641, 29899, 23027, 3444, 29892, 1105, 7598, 16370, 408, 263, 14547, 297, 3681, 322, 4517, 1434, 8401, 304, 1570, 3088, 297, 29871, 29896, 29947, 29945, 29900, 29889, 940, 471, 278, 937, 14547, 304, 367, 298, 2859, 491, 278, 15141, 6496, 5556, 3712, 1417, 30010, 29879, 27144, 29892, 988, 540, 3796, 363, 29871, 29896, 29900, 2440, 29889, 940, 769, 6496, 670, 1914, 27144, 29892, 607, 471, 263, 12176, 2551, 29889, 13, 6295, 7598, 471, 263, 410, 29880, 928, 9227, 322, 670, 8277, 3160, 450, 402, 7614, 4917, 293, 2169, 759, 1061, 313, 29896, 29947, 29945, 29946, 511, 450, 402, 7614, 4917, 293, 2169, 759, 1061, 322, 17278, 708, 363, 278, 11647, 313, 29896, 29947, 29945, 29945, 511, 450, 315, 4664, 457, 310, 1763, 29899, 3250, 313, 29896, 29947, 29945, 29929, 511, 450, 315, 4664, 457, 310, 1763, 29899, 26122, 313, 29896, 29947, 29953, 29946, 511, 450, 315, 4664, 457, 310, 278, 16367, 313, 29896, 29947, 29953, 29955, 511, 450, 315, 4664, 457, 310, 278, 16367, 313, 29896, 29947, 29955, 29941, 511, 450, 315, 4664, 457, 310, 278, 16367, 313, 29896, 29947, 29955, 29946, 511, 450, 315, 4664, 457, 310, 278, 16367, 313, 29896, 29947, 29955, 29945, 511, 450, 315, 4664, 457, 310, 278, 16367, 313, 29896, 29947, 29955, 29953, 511, 450] +``` + +
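+
+* (Optional) Stream the same request over HTTP
+
+Since the server above was configured with `decoupled_mode:True`, you can also try streaming over HTTP instead of using the gRPC client. This is a hedged sketch: it assumes your Triton build exposes the `generate_stream` endpoint of the [generate extension](https://github.com/triton-inference-server/server/blob/main/docs/protocol/extension_generate.md), which returns Server-Sent Events with one partial result per event.
+
+```bash
+# Sketch: stream tokens via the HTTP generate extension (assumes generate_stream is available).
+curl -X POST localhost:8000/v2/models/ensemble/generate_stream \
+    -d '{"text_input": "What is machine learning?", "max_tokens": 20, "bad_words": "", "stop_words": "", "pad_id": 2, "end_id": 2, "stream": true}'
+```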
+ + +* Run several requests at the same time + +```bash +echo '{"text_input": "What is machine learning?", "max_tokens": 20, "bad_words": "", "stop_words": "", "pad_id": 2, "end_id": 2}' > tmp.txt +printf '%s\n' {1..20} | xargs -I % -P 20 curl -X POST localhost:8000/v2/models/ensemble/generate -d @tmp.txt +``` diff --git a/docs/llama_multi_instance.md b/docs/llama_multi_instance.md new file mode 100644 index 00000000..5afdebd7 --- /dev/null +++ b/docs/llama_multi_instance.md @@ -0,0 +1,326 @@ + + +# Running Multiple Instances of the LLaMa Model + +This document describes how you can run multiple instances of +LLaMa model on single and multiple GPUs running on the +same machine. The guide focuses on the following scenarios: + +* [Running multiple instances of LLaMa model on a single GPU](#running-multiple-instances-of-llama-model-on-a-single-gpu). +* [Running multiple instances of LLaMa model on multiple GPUs](#running-multiple-instances-of-llama-model-on-multiple-gpus): + + a. Using [Orchestrator mode](#orchestrator-mode). + + b. Using [Leader mode](#leader-mode). + +## Running multiple instances of LLaMa model on a single GPU + +1. Setup the model repository as described in [LLaMa Guide](./llama.md). + +2. Increase the number of instances for the `instance_group` parameter for +the `tensorrt_llm` model. + +3. Start the triton server: + +```bash +# Replace the with the gpu you want to use for this model. +CUDA_VISIBLE_DEVICES= tritonserver --model-repository `pwd`/llama_ifb & +``` + +This would create multiple instances of the `tensorrt_llm` model, running on the +same GPU. + +> **Note** +> +> Running multiple instances of a single model is generally not +> recommended. If you choose to do this, you need to ensure the GPU has enough +> resources for multiple copies of a single model. The performance implications +> of running multiple models on the same GPU are unpredictable. + +> **Note** +> +> For production deployments please make sure to adjust the +> `max_tokens_in_paged_kv_cache` parameter, otherwise you may run out of GPU +> memory since TensorRT-LLM by default may use 90% of GPU for KV-Cache for each +> model instance. Additionally, if you rely on `kv_cache_free_gpu_mem_fraction` +> the memory allocated to each instance will depend on the order in which instances are loaded. + +4. Run the test client to measure performance: + +```bash +python3 tensorrt_llm/triton_backend/tools/inflight_batcher_llm/end_to_end_test.py --dataset tensorrt_llm/triton_backend/ci/L0_backend_trtllm/simple_data.json --max-input-len 500 +``` + +If you plan to use the BLS version instead of the ensemble model, you might also +need to adjust the number of model instances for the `tensorrt_llm_bls` model. +The default value only allows a single request for the whole pipeline which +might increase the latency and reduce the throughput. + +5. Kill the server: + +```bash +pgrep tritonserver | xargs kill +``` + +## Running multiple instances of LLaMa model on multiple GPUs + +Unlike other Triton backend models, the TensorRT-LLM backend does not support +using `instance_group` setting for determining the placement of model instances +on different GPUs. In this section, we demonstrate how you can use +[Leader Mode](../README.md#leader-mode) and [Orchestrator Mode](../README.md#orchestrator-mode) +for running multiple instances of a LLaMa model on different GPUs. + +For this section, let's assume that we have four GPUs and the CUDA device ids +are 0, 1, 2, and 3. 
We will be launching two instances of the LLaMa2-7b model +with tensor parallelism equal to 2. The first instance will run on GPUs 0 and 1 +and the second instance will run on GPUs 2 and 3. + +1. Create the engines: + +```bash +# Update if the model is not available in huggingface cache +export HF_LLAMA_MODEL=`python3 -c "from pathlib import Path; from huggingface_hub import hf_hub_download; print(Path(hf_hub_download('meta-llama/Llama-2-7b-hf', filename='config.json')).parent)"` + +export UNIFIED_CKPT_PATH=/tmp/ckpt/llama/7b-2tp-2gpu/ +export ENGINE_PATH=/tmp/engines/llama/7b-2tp-2gpu/ + +# Create the checkpoint +python tensorrt_llm/examples/llama/convert_checkpoint.py --model_dir ${HF_LLAMA_MODEL} \ + --output_dir ${UNIFIED_CKPT_PATH} \ + --dtype float16 \ + --tp_size 2 + +# Build the engines +trtllm-build --checkpoint_dir ${UNIFIED_CKPT_PATH} \ + --remove_input_padding enable \ + --gpt_attention_plugin float16 \ + --context_fmha enable \ + --gemm_plugin float16 \ + --output_dir ${ENGINE_PATH} \ + --kv_cache_type paged \ + --max_batch_size 64 +``` + +2. Setup the model repository: + +```bash +# Setup the model repository for the first instance. +cp tensorrt_llm/triton_backend/ci/all_models/inflight_batcher_llm/ llama_ifb -r + +python3 tensorrt_llm/triton_backend/tools/fill_template.py -i llama_ifb/preprocessing/config.pbtxt tokenizer_dir:${HF_LLAMA_MODEL},triton_max_batch_size:64,preprocessing_instance_count:1 +python3 tensorrt_llm/triton_backend/tools/fill_template.py -i llama_ifb/postprocessing/config.pbtxt tokenizer_dir:${HF_LLAMA_MODEL},triton_max_batch_size:64,postprocessing_instance_count:1 +python3 tensorrt_llm/triton_backend/tools/fill_template.py -i llama_ifb/tensorrt_llm_bls/config.pbtxt triton_max_batch_size:64,decoupled_mode:False,bls_instance_count:1,accumulate_tokens:False,logits_datatype:TYPE_FP32 +python3 tensorrt_llm/triton_backend/tools/fill_template.py -i llama_ifb/ensemble/config.pbtxt triton_max_batch_size:64,logits_datatype:TYPE_FP32 +python3 tensorrt_llm/triton_backend/tools/fill_template.py -i llama_ifb/tensorrt_llm/config.pbtxt triton_backend:tensorrtllm,triton_max_batch_size:64,decoupled_mode:False,max_beam_width:1,engine_dir:${ENGINE_PATH},max_tokens_in_paged_kv_cache:2560,max_attention_window_size:2560,kv_cache_free_gpu_mem_fraction:0.5,exclude_input_in_output:True,enable_kv_cache_reuse:False,batching_strategy:inflight_fused_batching,max_queue_delay_microseconds:0,encoder_input_features_data_type:TYPE_FP16,logits_datatype:TYPE_FP32 +``` + +### Leader Mode + +For leader mode, we will launch two separate `mpirun` commands to launch two +separate Triton servers, one for each GPU (4 Triton Server instances in total). +We also need to use a reverse proxy in front of them to load balance the requests +between the servers. + +3a. Launch the servers: + +```bash +CUDA_VISIBLE_DEVICES=0,1 python3 tensorrt_llm/triton_backend/scripts/launch_triton_server.py --world_size 2 --model_repo=llama_ifb/ --http_port 8000 --grpc_port 8001 --metrics_port 8004 +CUDA_VISIBLE_DEVICES=2,3 python3 tensorrt_llm/triton_backend/scripts/launch_triton_server.py --world_size 2 --model_repo=llama_ifb/ --http_port 8002 --grpc_port 8003 --metrics_port 8005 +``` + +4a. Install NGINX: + +```bash +apt update +apt install nginx -y +``` + +5a. 
Setup the NGINX configuration and store it in `/etc/nginx/sites-available/tritonserver`: + +```conf +upstream tritonserver { + server localhost:8000; + server localhost:8002; +} + +server { + listen 8080; + + location / { + proxy_pass http://tritonserver; + } +} +``` + +6a. Create a symlink and restart NGINX to enable the configuration: + +``` +ln -s /etc/nginx/sites-available/tritonserver /etc/nginx/sites-enabled/tritonserver +service nginx restart +``` + +7a. Run the test client to measure performance: + +```bash +pip3 install tritonclient[all] + +# Test the load on all the servers +python3 tensorrt_llm/triton_backend/tools/inflight_batcher_llm/end_to_end_test.py --dataset tensorrt_llm/triton_backend/ci/L0_backend_trtllm/simple_data.json --max-input-len 500 -u localhost:8080 + +# Test the load on one of the servers +python3 tensorrt_llm/triton_backend/tools/inflight_batcher_llm/end_to_end_test.py --dataset tensorrt_llm/triton_backend/ci/L0_backend_trtllm/simple_data.json --max-input-len 500 -u localhost:8000 +``` + +8a. Kill the server: + +```bash +pgrep mpirun | xargs kill +``` + +### Orchestrator Mode + +With orchestrator mode, there are two options for running multiple instances +of a single model: + +1. Creating separate Triton models + +2. Starting from the 24.08 release, you can use Triton `instance_group` field to specify the number TRT-LLM model instances. With that option, the load balancing decision will be done in Triton core. + +#### 1. Creating Separate Triton Models + +3b. Create a copy of the `tensorrt_llm` model: + +```bash +cp llama_ifb/tensorrt_llm llama_ifb/tensorrt_llm_2 -r +``` + +4b. Modify the `gpu_device_ids` field in the config file to specify which GPUs +should be used by each model: + +```bash +sed -i 's/\${gpu_device_ids}/0,1/g' llama_ifb/tensorrt_llm/config.pbtxt +sed -i 's/\${gpu_device_ids}/2,3/g' llama_ifb/tensorrt_llm_2/config.pbtxt +sed -i 's/name: "tensorrt_llm"/name: "tensorrt_llm_2"/g' llama_ifb/tensorrt_llm_2/config.pbtxt +``` + +> **Note** +> +> If you want to use the ensemble or BLS models, you have to create a +> copy of the ensemble and BLS models as well and modify the "tensorrt_llm" +> model name to "tensorrt_llm_2" in the config file. + +5b. Launch the server: + +```bash +python3 tensorrt_llm/triton_backend/scripts/launch_triton_server.py --multi-model --model_repo=llama_ifb/ +``` + +Alternatively, you can start all MPI ranks at once and avoid dynamic process spawning +by using the `--disable-spawn-processes`. The config file must specify which ranks each +model should use: + +```bash +sed -i 's/\${participant_ids}/1,2/g' llama_ifb/tensorrt_llm/config.pbtxt +sed -i 's/\${participant_ids}/3,4/g' llama_ifb/tensorrt_llm_2/config.pbtxt +``` + +Note that rank 0 is reserved for the orchestrator rank. + +```bash +python3 tensorrt_llm/triton_backend/scripts/launch_triton_server.py --multi-model --model_repo=llama_ifb/ --disable-spawn-processes --world_size=5 +``` + +6b. Run the test client to measure performance: + +```bash +pip3 install tritonclient[all] + +# We will only benchmark the core tensorrtllm models. +python3 tensorrt_llm/triton_backend/tools/inflight_batcher_llm/benchmark_core_model.py --max-input-len 500 \ + --tensorrt-llm-model-name tensorrt_llm \ + --tensorrt-llm-model-name tensorrt_llm_2 \ + dataset --dataset tensorrt_llm/triton_backend/ci/L0_backend_trtllm/simple_data.json \ + --tokenizer-dir $HF_LLAMA_MODEL +``` + +7b. Kill the server: + +```bash +pgrep mpirun | xargs kill +``` + +#### 2. 
Using Triton Core's Load Balancing + +In order to use Triton core's load balancing for multiple instances, you can +increase the number of instances in the `instance_group` field and use the +`gpu_device_ids` parameter to specify which GPUs will be used by each model +instance. + +For example, if you're running a TP=2 model on a 4-GPU system and you want +to run one instance on GPUs 0 and 1 and the other instance on GPUs 2 and 3, +you can use the following model configuration: + +``` +instance_group [ + {kind: KIND_CPU, count: 2} +] + +parameters: { + key: "gpu_device_ids" + value: { + string_value: "0,1;2,3" + } +} +``` + +Please note that the number of set of GPU device ids must equal the number of instances. + +### Orchestrator Mode vs Leader Mode Summary + +The table below summarizes the differences between the orchestrator mode and +leader mode: + +| | Orchestrator Mode (Separate Models) | Orchestrator Mode (Triton Load Balancing) |Leader Mode | +| ----------------------------------| :----------------: | :----------------: |:----------:| +| Requires Reverse Proxy | ❌ | ❌ | ✅ | +| Requires Client Changes | ✅ | ❌ | ❌ | + +Orchestrator mode by default uses `MPI_Comm_Spawn` to create the child +processes. If `MPI_Comm_Spawn` is used, it is not possible to distribute +the model across multiple nodes. + +It is also possible to use orchestrator mode with MPI processes that have been +pre-spawned. In order to do that, you need to set `--disable-spawn-processes` +when using the [launch_triton_server.py](../scripts/launch_triton_server.py) +script or `export TRTLLM_ORCHESTRATOR_SPAWN_PROCESSES=0`. In this mode, +it is possible to run the server across different nodes in orchestrator mode. + +In order to use the orchestrator mode itself, you need to set the `--multi-model` +flag when using the [launch_triton_server.py](../scripts/launch_triton_server.py) +script or `export TRTLLM_ORCHESTRATOR=1`. diff --git a/docs/llmapi.md b/docs/llmapi.md new file mode 100644 index 00000000..cd21f26a --- /dev/null +++ b/docs/llmapi.md @@ -0,0 +1,64 @@ +## End to end workflow to use the pytorch LLMAPI workflow + +* Start the Triton Server Docker container: + +```bash +# Replace with the version of Triton you want to use. +# The command below assumes the the current directory is the +# TRT-LLM backend root git repository. + +docker run --rm -ti -v `pwd`:/mnt -w /mnt -v ~/.cache/huggingface:~/.cache/huggingface --gpus all nvcr.io/nvidia/tritonserver:\-trtllm-python-py3 bash +``` + +* Prepare config + +```bash + cp -R tensorrt_llm/triton_backend/all_models/llmapi/ llmapi_repo/ +``` + +Edit `llmapi_repo/tensorrt_llm/1/model.yaml` to change the model. You can either use a HuggingFace path or a local path. The following is based on `meta-llama/Llama-3.1-8B`. + +This configuration file also allows you to enable CUDA graphs support and set pipeline parallelism and tensor parallelism sizes. + +* Launch server + +```bash +python3 tensorrt_llm/triton_backend/scripts/launch_triton_server.py --model_repo=llmapi_repo/ +``` + +* Send request + +```bash +curl -X POST localhost:8000/v2/models/tensorrt_llm/generate -d '{"text_input": "The future of AI is", "max_tokens":10}' | jq +``` + +`inflight_batcher_llm_client.py` is not supported yet. + +* Run test on dataset + +```bash +python3 tensorrt_llm/triton_backend/tools/inflight_batcher_llm/end_to_end_test.py --dataset tensorrt_llm/triton_backend/ci/L0_backend_trtllm/simple_data.json --max-input-len 500 --test-llmapi --model-name tensorrt_llm + +[INFO] Start testing on 13 prompts. 
+[INFO] Functionality test succeeded. +[INFO] Warm up for benchmarking. +FLAGS.model_name: tensorrt_llm +[INFO] Start benchmarking on 13 prompts. +[INFO] Total Latency: 377.254 ms +``` + +* Run benchmark + +```bash + python3 tensorrt_llm/triton_backend/tools/inflight_batcher_llm/benchmark_core_model.py --max-input-len 500 \ + --tensorrt-llm-model-name tensorrt_llm \ + --test-llmapi \ + dataset --dataset ./tensorrt_llm/triton_backend/tools/dataset/mini_cnn_eval.json \ + --tokenizer-dir meta-llama/Llama-3.1-8B + +dataset +Tokenizer: Tokens per word = 1.308 +[INFO] Warm up for benchmarking. +[INFO] Start benchmarking on 39 prompts. +[INFO] Total Latency: 1446.623 ms +``` diff --git a/docs/lora.md b/docs/lora.md new file mode 100644 index 00000000..d2478efa --- /dev/null +++ b/docs/lora.md @@ -0,0 +1,269 @@ +# Running LoRA inference with inflight batching + +Below is an example of how to run LoRA inference with inflight batching. See the +[LoRA documentation](https://github.com/NVIDIA/TensorRT-LLM/blob/main/docs/source/advanced/lora.md) +in the TensorRT-LLM repository for more information about running gpt-2b with +LoRA using inflight batching. + +## Launch Triton TensorRT-LLM container + +```bash +docker run --rm -it --net host --shm-size=2g \ + --ulimit memlock=-1 --ulimit stack=67108864 --gpus all \ + -v :/tensorrtllm_backend \ + -v ${HOME}/.cache/huggingface:/root/.cache/huggingface \ + -v :/engines \ + nvcr.io/nvidia/tritonserver:-trtllm-python-py3 +``` + +## Prepare TensorRT-LLM engines with LoRA enable + +(Optional) Download the LLaMa model from HuggingFace if you haven't already. + +```bash +huggingface-cli login +huggingface-cli download meta-llama/Llama-2-7b-hf +``` + +> **NOTE** +> +> Make sure that you have access to https://huggingface.co/meta-llama/Llama-2-7b-hf. + +```bash +cd /tensorrtllm_backend/tensorrt_llm/examples/llama +BASE_LLAMA_MODEL=/path/to/llama-7b-hf + +python3 convert_checkpoint.py --model_dir ${BASE_LLAMA_MODEL} \ + --output_dir ./c-model/llama/fp16/1-gpu \ + --dtype float16 + +trtllm-build --checkpoint_dir ./c-model/llama/fp16/1-gpu \ + --output_dir /engines/llama_7b_with_lora_qkv/fp16/1-gpu \ + --gemm_plugin float16 \ + --max_batch_size 8 \ + --max_seq_len 562 \ + --gpt_attention_plugin float16 \ + --kv_cache_type paged \ + --remove_input_padding enable \ + --use_paged_context_fmha enable \ + --lora_plugin float16 \ + --lora_target_modules attn_q attn_k attn_v \ + --max_lora_rank 8 +``` + +Note that you still need to use `hf_lora_convert.py` to convert the lora weights and store in `/tmp/lora_prefetch`. But users don't need to send the `--lora-path` when you run the inference at the first time. + +## Generate LoRA tensors + +Now generate LoRA tensors that will be passed in with each request to triton. + +```bash +git-lfs clone https://huggingface.co/qychen/luotuo-lora-7b-0.1 +git-lfs clone https://huggingface.co/kunishou/Japanese-Alpaca-LoRA-7b-v0 + +python3 ..//hf_lora_convert.py -i luotuo-lora-7b-0.1 -o luotuo-lora-7b-0.1-weights --storage-type float16 +python3 ../hf_lora_convert.py -i Japanese-Alpaca-LoRA-7b-v0 -o Japanese-Alpaca-LoRA-7b-v0-weights --storage-type float16 +``` + +## Create a Triton model repository and launch the Triton server + +Create a Triton model repository following the instructions +[here](../README.md#prepare-the-model-repository), and modify the model +configuration following the steps +[here](../README.md#modify-the-model-configuration). 
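+
+As part of that configuration step, the LoRA cache parameters described in the next section can be filled in with the same `fill_template.py` tool used throughout these documents. The snippet below is only a sketch: the repository path `triton_model_repo` and the chosen values are assumptions, and the parameter keys must exist as `${...}` placeholders in your `tensorrt_llm/config.pbtxt`.
+
+```bash
+# Sketch: fill the LoRA cache related fields of the tensorrt_llm model config.
+python3 /tensorrtllm_backend/tensorrt_llm/triton_backend/tools/fill_template.py \
+    -i triton_model_repo/tensorrt_llm/config.pbtxt \
+    lora_cache_optimal_adapter_size:8,lora_cache_max_adapter_size:64,lora_cache_gpu_memory_fraction:0.05,lora_cache_host_memory_bytes:1073741824
+```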
+ +## LoRA Cache + +As LoRA weights are passed to the backend they will be cached in a host cache. +As requests are scheduled, those weights with be prefetched to a gpu cache. +After a LoRA is loaded into the cache, only `lora_task_id` is needed for inference. + +### lora_cache_optimal_adapter_size + +Optimal adapter size used to size cache pages. Typically optimally sized +adapters will fix exactly into 1 cache page. (default: 8) + +``` +parameters: { + key: "lora_cache_optimal_adapter_size" + value: { + string_value: "${lora_cache_optimal_adapter_size}" + } +} +``` + +### lora_cache_max_adapter_size + +Used to set the minimum size of a cache page. Pages must be at least large enough to fit a single module, single later adapter_size `maxAdapterSize` row of weights. (default: 64) + +``` +parameters: { + key: "lora_cache_max_adapter_size" + value: { + string_value: "${lora_cache_max_adapter_size}" + } +} +``` + +### lora_cache_gpu_memory_fraction + +Fraction of GPU memory used for LoRA cache. Computed as a fraction of left over memory after engine load, and after KV cache is loaded (default: 0.05) + +``` +parameters: { + key: "lora_cache_gpu_memory_fraction" + value: { + string_value: "${lora_cache_gpu_memory_fraction}" + } +} +``` + +### lora_cache_host_memory_bytes + +Size of host LoRA cache in bytes (default: 1G) + +``` +parameters: { + key: "lora_cache_host_memory_bytes" + value: { + string_value: "${lora_cache_host_memory_bytes}" + } +} +``` + +### prefetch lora cache during initializing the model instance + +If users want to load the lora models during initializing the model instance, +instead of passing the lora weight as input, users can store the lora weights in `` +and pass it as a parameter to initialize the model instance. +Then, the model instance will try to load the lora weights from the folder. +In the folder, users can put many folders for different lora tasks. +For example, assume we want to store lora weights in `/tmp/lora_prefetch` and +there are three lora tasks `0`, `1` and `3`, then the architecture of the folder would be like + +```bash +/tmp/lora_prefetch +├── 0 +│ ├── model.lora_config.npy +│ └── model.lora_weights.npy +├── 1 +│ ├── model.lora_config.npy +│ └── model.lora_weights.npy +└── 3 + ├── model.lora_config.npy + └── model.lora_weights.npy +``` + +Note that you must name the folder by digit because the lora cache manager will view these name as lora task ids. + +```pbtxt +parameters: { + key: "lora_prefetch_dir" + value: { + string_value: "${lora_prefetch_dir}" + } +} +``` + +## Launch tritonserver + +```bash +MODEL_FOLDER=/path/to/triton_model_repo +# 'world_size' is the number of GPUs you want to use for serving. This should +# be aligned with the number of GPUs used to build the TensorRT-LLM engine. +python3 /tensorrtllm_backend/tensorrt_llm/triton_backend/scripts/launch_triton_server.py --world_size=1 --model_repo=${MODEL_FOLDER} +``` + +Run Multi-LoRA example by issuing multiple concurrent requests. +The inflight batcher will execute mixed batches with multiple LoRAs in the same batch. + +First we cache the LoRAs by sending dummy requests for each adapter. 
The TASK_IDS are uniq to the adapter + +```bash +pip3 install tritonclient[all] + +TASK_IDS=("1" "2") +LORA_PATHS=("luotuo-lora-7b-0.1-weights" "Japanese-Alpaca-LoRA-7b-v0-weights") +INFLIGHT_BATCHER_LLM_CLIENT=/tensorrtllm_backend/tensorrt_llm/triton_backend/tools/inflight_batcher_llm/inflight_batcher_llm_client.py + +for index in ${!TASK_IDS[@]}; do + text="dummy" + lora_path=${LORA_PATHS[$index]} + task_id=${TASK_IDS[$index]} + lora_arg="--lora-path ${lora_path} --lora-task-id ${task_id}" + + python3 ${INFLIGHT_BATCHER_LLM_CLIENT} \ + --top-k 0 \ + --top-p 0.5 \ + --request-output-len 10 \ + --text "${text}" \ + --tokenizer-dir /path/to/llama-7b-hf \ + ${lora_arg} & +done +``` + +Now perform inference with just `--lora-task-id` + +```bash +INPUT_TEXT=("美国的首都在哪里? \n答案:" "美国的首都在哪里? \n答案:" "美国的首都在哪里? \n答案:" "アメリカ合衆国の首都はどこですか? \n答え:" "アメリカ合衆国の首都はどこですか? \n答え:" "アメリカ合衆国の首都はどこですか? \n答え:") +TASK_IDS=("" "1" "2" "" "1" "2") + +for index in ${!INPUT_TEXT[@]}; do + text=${INPUT_TEXT[$index]} + task_id=${TASK_IDS[$index]} + lora_arg="" + if [ "${task_id}" != "" ]; then + lora_arg="--lora-task-id ${task_id}" + fi + + python3 inflight_batcher_llm/client/inflight_batcher_llm_client.py \ + --top-k 0 \ + --top-p 0.5 \ + --request-output-len 10 \ + --text "${text}" \ + --tokenizer-dir /home/scratch.trt_llm_data/llm-models/llama-models/llama-7b-hf \ + ${lora_arg} & +done + +wait +``` + +Example Output: + +``` +Input sequence: [1, 29871, 30310, 30604, 30303, 30439, 30733, 235, 164, 137, 30356, 30199, 31688, 30769, 30449, 31250, 30589, 30499, 30427, 30412, 29973, 320, 29876, 234, 176, 151, 30914, 29901] +Input sequence: [1, 29871, 30630, 30356, 30210, 31688, 30769, 30505, 232, 150, 173, 30755, 29973, 320, 29876, 234, 176, 151, 233, 164, 139, 29901] +Input sequence: [1, 29871, 30630, 30356, 30210, 31688, 30769, 30505, 232, 150, 173, 30755, 29973, 320, 29876, 234, 176, 151, 233, 164, 139, 29901] +Input sequence: [1, 29871, 30310, 30604, 30303, 30439, 30733, 235, 164, 137, 30356, 30199, 31688, 30769, 30449, 31250, 30589, 30499, 30427, 30412, 29973, 320, 29876, 234, 176, 151, 30914, 29901] +Input sequence: [1, 29871, 30310, 30604, 30303, 30439, 30733, 235, 164, 137, 30356, 30199, 31688, 30769, 30449, 31250, 30589, 30499, 30427, 30412, 29973, 320, 29876, 234, 176, 151, 30914, 29901] +Input sequence: [1, 29871, 30630, 30356, 30210, 31688, 30769, 30505, 232, 150, 173, 30755, 29973, 320, 29876, 234, 176, 151, 233, 164, 139, 29901] +Got completed request +Input: アメリカ合衆国の首都はどこですか? \n答え: +Output beam 0: ワシントン D.C. +Output sequence: [1, 29871, 30310, 30604, 30303, 30439, 30733, 235, 164, 137, 30356, 30199, 31688, 30769, 30449, 31250, 30589, 30499, 30427, 30412, 29973, 320, 29876, 234, 176, 151, 30914, 29901, 29871, 31028, 30373, 30203, 30279, 30203, 360, 29889, 29907, 29889] +Got completed request +Input: 美国的首都在哪里? \n答案: +Output beam 0: Washington, D.C. +What is the +Output sequence: [1, 29871, 30630, 30356, 30210, 31688, 30769, 30505, 232, 150, 173, 30755, 29973, 320, 29876, 234, 176, 151, 233, 164, 139, 29901, 7660, 29892, 360, 29889, 29907, 29889, 13, 5618, 338, 278] +Got completed request +Input: 美国的首都在哪里? \n答案: +Output beam 0: Washington D.C. +Washington D. +Output sequence: [1, 29871, 30630, 30356, 30210, 31688, 30769, 30505, 232, 150, 173, 30755, 29973, 320, 29876, 234, 176, 151, 233, 164, 139, 29901, 7660, 360, 29889, 29907, 29889, 13, 29956, 7321, 360, 29889] +Got completed request +Input: アメリカ合衆国の首都はどこですか? \n答え: +Output beam 0: Washington, D.C. 
+Which of +Output sequence: [1, 29871, 30310, 30604, 30303, 30439, 30733, 235, 164, 137, 30356, 30199, 31688, 30769, 30449, 31250, 30589, 30499, 30427, 30412, 29973, 320, 29876, 234, 176, 151, 30914, 29901, 7660, 29892, 360, 29889, 29907, 29889, 13, 8809, 436, 310] +Got completed request +Input: アメリカ合衆国の首都はどこですか? \n答え: +Output beam 0: Washington D.C. +1. ア +Output sequence: [1, 29871, 30310, 30604, 30303, 30439, 30733, 235, 164, 137, 30356, 30199, 31688, 30769, 30449, 31250, 30589, 30499, 30427, 30412, 29973, 320, 29876, 234, 176, 151, 30914, 29901, 7660, 360, 29889, 29907, 29889, 13, 29896, 29889, 29871, 30310] +Got completed request +Input: 美国的首都在哪里? \n答案: +Output beam 0: 华盛顿 +W +Output sequence: [1, 29871, 30630, 30356, 30210, 31688, 30769, 30505, 232, 150, 173, 30755, 29973, 320, 29876, 234, 176, 151, 233, 164, 139, 29901, 29871, 31266, 234, 158, 158, 236, 164, 194, 13, 29956] +``` diff --git a/docs/model_config.md b/docs/model_config.md new file mode 100644 index 00000000..e95ebb8b --- /dev/null +++ b/docs/model_config.md @@ -0,0 +1,376 @@ +# Model Configuration + +## Model Parameters + +The following tables show the parameters in the `config.pbtxt` of the models in +[all_models/inflight_batcher_llm](https://github.com/NVIDIA/TensorRT-LLM/blob/main/triton_backend/all_models/inflight_batcher_llm). +that can be modified before deployment. For optimal performance or custom +parameters, please refer to +[perf_best_practices](https://github.com/NVIDIA/TensorRT-LLM/blob/main/docs/source/performance/perf-best-practices.md). + +The names of the parameters listed below are the values in the `config.pbtxt` +that can be modified using the +[`fill_template.py`](https://github.com/NVIDIA/TensorRT-LLM/blob/main/triton_backend/tools/fill_template.py) script. + +**NOTE** For fields that have comma as the value (e.g. `gpu_device_ids`, +`participant_ids`), you need to escape the comma with +a backslash. For example, if you want to set `gpu_device_ids` to `0,1` you need +to run `python3 fill_template.py -i config.pbtxt "gpu_device_ids:0\,1".` + +The mandatory parameters must be set for the model to run. The optional +parameters are not required but can be set to customize the model. + +### ensemble model + +See +[here](https://github.com/triton-inference-server/server/blob/main/docs/user_guide/architecture.md#ensemble-models) +to learn more about ensemble models. + +*Mandatory parameters* + +| Name | Description | +| :----------------------: | :-----------------------------: | +| `triton_max_batch_size` | The maximum batch size that the Triton model instance will run with. Note that for the `tensorrt_llm` model, the actual runtime batch size can be larger than `triton_max_batch_size`. The runtime batch size will be determined by the TRT-LLM scheduler based on a number of parameters such as number of available requests in the queue, and the engine build `trtllm-build` parameters (such `max_num_tokens` and `max_batch_size`). | +| `logits_datatype` | The data type for context and generation logits. | + +### preprocessing model + +*Mandatory parameters* + +| Name | Description | +| :----------------------: | :-----------------------------: | +| `triton_max_batch_size` | The maximum batch size that Triton should use with the model. | +| `tokenizer_dir` | The path to the tokenizer for the model. | +| `preprocessing_instance_count` | The number of instances of the model to run. | +| `max_queue_delay_microseconds` | The maximum queue delay in microseconds. 
Setting this parameter to a value greater than 0 can improve the chances that two requests arriving within `max_queue_delay_microseconds` will be scheduled in the same TRT-LLM iteration. | +| `max_queue_size` | The maximum number of requests allowed in the TRT-LLM queue before rejecting new requests. | + +*Optional parameters* + +| Name | Description | +| :----------------------: | :-----------------------------: | +| `add_special_tokens` | The `add_special_tokens` flag used by [HF tokenizers](https://huggingface.co/transformers/v2.11.0/main_classes/tokenizer.html#transformers.PreTrainedTokenizer.add_special_tokens). | +| `multimodal_model_path` | The vision engine path used in multimodal workflow. | +| `engine_dir` | The path to the engine for the model. This parameter is only needed for *multimodal processing* to extract the `vocab_size` from the engine_dir's config.json for `fake_prompt_id` mappings. | + + +### multimodal_encoders model + +*Mandatory parameters* + +| Name | Description | +| :----------------------: | :-----------------------------: | +| `triton_max_batch_size` | The maximum batch size that Triton should use with the model. | +| `max_queue_delay_microseconds` | The maximum queue delay in microseconds. Setting this parameter to a value greater than 0 can improve the chances that two requests arriving within `max_queue_delay_microseconds` will be scheduled in the same TRT-LLM iteration. | +| `max_queue_size` | The maximum number of requests allowed in the TRT-LLM queue before rejecting new requests. | +| `multimodal_model_path` | The vision engine path used in multimodal workflow. | +| `hf_model_path` | The Huggingface model path used for `llava_onevision` and `mllama` models. | + + +### postprocessing model + +*Mandatory parameters* + +| Name | Description | +| :----------------------: | :-----------------------------: | +| `triton_max_batch_size` | The maximum batch size that Triton should use with the model. | +| `tokenizer_dir` | The path to the tokenizer for the model. | +| `postprocessing_instance_count` | The number of instances of the model to run. | + +*Optional parameters* + +| Name | Description | +| :----------------------: | :-----------------------------: | +| `skip_special_tokens` | The `skip_special_tokens` flag used by [HF detokenizers](https://huggingface.co/transformers/v2.11.0/main_classes/tokenizer.html#transformers.PreTrainedTokenizer.decode). | + +### tensorrt_llm model + +The majority of the `tensorrt_llm` model parameters and input/output tensors +can be mapped to parameters in the TRT-LLM C++ runtime API defined in +[`executor.h`](https://github.com/NVIDIA/TensorRT-LLM/blob/main/cpp/include/tensorrt_llm/executor/executor.h). +Please refer to the Doxygen comments in `executor.h` for a more detailed +description of the parameters below. + +*Mandatory parameters* + +| Name | Description | +| :----------------------: | :-----------------------------: | +| `triton_backend` | The backend to use for the model. Set to `tensorrtllm` to utilize the C++ TRT-LLM backend implementation. Set to `python` to utlize the TRT-LLM Python runtime. | +| `triton_max_batch_size` | The maximum batch size that the Triton model instance will run with. Note that for the `tensorrt_llm` model, the actual runtime batch size can be larger than `triton_max_batch_size`. 
The runtime batch size will be determined by the TRT-LLM scheduler based on a number of parameters such as number of available requests in the queue, and the engine build `trtllm-build` parameters (such `max_num_tokens` and `max_batch_size`). | +| `decoupled_mode` | Whether to use decoupled mode. Must be set to `true` for requests setting the `stream` tensor to `true`. | +| `max_queue_delay_microseconds` | The maximum queue delay in microseconds. Setting this parameter to a value greater than 0 can improve the chances that two requests arriving within `max_queue_delay_microseconds` will be scheduled in the same TRT-LLM iteration. | +| `max_queue_size` | The maximum number of requests allowed in the TRT-LLM queue before rejecting new requests. | +| `engine_dir` | The path to the engine for the model. | +| `batching_strategy` | The batching strategy to use. Set to `inflight_fused_batching` when enabling in-flight batching support. To disable in-flight batching, set to `V1` | +| `encoder_input_features_data_type` | The dtype for the input tensor `encoder_input_features`. For the mllama model, this must be `TYPE_BF16`. For other models like whisper, this is `TYPE_FP16`. | +| `logits_datatype` | The data type for context and generation logits. | + +*Optional parameters* + +- General + +| Name | Description | +| :----------------------: | :-----------------------------: | +| `encoder_engine_dir` | When running encoder-decoder models, this is the path to the folder that contains the model configuration and engine for the encoder model. | +| `max_attention_window_size` | When using techniques like sliding window attention, the maximum number of tokens that are attended to generate one token. Defaults attends to all tokens in sequence. (default=max_sequence_length) | +| `sink_token_length` | Number of sink tokens to always keep in attention window. | +| `exclude_input_in_output` | Set to `true` to only return completion tokens in a response. Set to `false` to return the prompt tokens concatenated with the generated tokens. (default=`false`) | +| `cancellation_check_period_ms` | The time for cancellation check thread to sleep before doing the next check. It checks if any of the current active requests are cancelled through triton and prevent further execution of them. (default=100) | +| `stats_check_period_ms` | The time for the statistics reporting thread to sleep before doing the next check. (default=100) | +| `recv_poll_period_ms` | The time for the receiving thread in orchestrator mode to sleep before doing the next check. (default=0) | +| `iter_stats_max_iterations` | The maximum number of iterations for which to keep statistics. (default=ExecutorConfig::kDefaultIterStatsMaxIterations) | +| `request_stats_max_iterations` | The maximum number of iterations for which to keep per-request statistics. (default=executor::kDefaultRequestStatsMaxIterations) | +| `normalize_log_probs` | Controls if log probabilities should be normalized or not. Set to `false` to skip normalization of `output_log_probs`. (default=`true`) | +| `gpu_device_ids` | Comma-separated list of GPU IDs to use for this model. Use semicolons to separate multiple instances of the model. If not provided, the model will use all visible GPUs. (default=unspecified) | +| `participant_ids` | Comma-separated list of MPI ranks to use for this model. Mandatory when using orchestrator mode with -disable-spawn-process (default=unspecified) | +| `num_nodes` | Number of MPI nodes to use for this model. 
(default=1) | +| `gpu_weights_percent` | Set to a number between 0.0 and 1.0 to specify the percentage of weights that reside on GPU instead of CPU and streaming load during runtime. Values less than 1.0 are only supported for an engine built with `weight_streaming` on. (default=1.0) | + +- KV cache + +Note that the parameter `enable_trt_overlap` has been removed from the +config.pbtxt. This option allowed to overlap execution of two micro-batches to +hide CPU overhead. Optimization work has been done to reduce the CPU overhead +and it was found that the overlapping of micro-batches did not provide +additional benefits. + +| Name | Description | +| :----------------------: | :-----------------------------: | +| `max_tokens_in_paged_kv_cache` | The maximum size of the KV cache in number of tokens. If unspecified, value is interpreted as 'infinite'. KV cache allocation is the min of max_tokens_in_paged_kv_cache and value derived from kv_cache_free_gpu_mem_fraction below. (default=unspecified) | +| `kv_cache_free_gpu_mem_fraction` | Set to a number between 0 and 1 to indicate the maximum fraction of GPU memory (after loading the model) that may be used for KV cache. (default=0.9) | +| `cross_kv_cache_fraction` | Set to a number between 0 and 1 to indicate the maximum fraction of KV cache that may be used for cross attention, and the rest will be used for self attention. Optional param and should be set for encoder-decoder models ONLY. (default=0.5) | +| `kv_cache_host_memory_bytes` | Enable offloading to host memory for the given byte size. | +| `enable_kv_cache_reuse` | Set to `true` to reuse previously computed KV cache values (e.g. for system prompt) | + +- LoRA cache + +| Name | Description | +| :----------------------: | :-----------------------------: | +| `lora_cache_optimal_adapter_size` | Optimal adapter size used to size cache pages. Typically optimally sized adapters will fix exactly into 1 cache page. (default=8) | +| `lora_cache_max_adapter_size` | Used to set the minimum size of a cache page. Pages must be at least large enough to fit a single module, single later adapter_size `maxAdapterSize` row of weights. (default=64) | +| `lora_cache_gpu_memory_fraction` | Fraction of GPU memory used for LoRA cache. Computed as a fraction of left over memory after engine load, and after KV cache is loaded. (default=0.05) | +| `lora_cache_host_memory_bytes` | Size of host LoRA cache in bytes. (default=1G) | +| `lora_prefetch_dir` | Folder to store the LoRA weights we hope to load during engine initialization. | + +- Decoding mode + +| Name | Description | +| :----------------------: | :-----------------------------: | +| `max_beam_width` | The beam width value of requests that will be sent to the executor. (default=1) | +| `decoding_mode` | Set to one of the following: `{top_k, top_p, top_k_top_p, beam_search, medusa, redrafter, lookahead, eagle}` to select the decoding mode. The `top_k` mode exclusively uses Top-K algorithm for sampling, The `top_p` mode uses exclusively Top-P algorithm for sampling. The top_k_top_p mode employs both Top-K and Top-P algorithms, depending on the runtime sampling params of the request. Note that the `top_k_top_p option` requires more memory and has a longer runtime than using `top_k` or `top_p` individually; therefore, it should be used only when necessary. `beam_search` uses beam search algorithm. If not specified, the default is to use `top_k_top_p` if `max_beam_width == 1`; otherwise, `beam_search` is used. 
+
+### tensorrt_llm_bls model
+
+See
+[here](https://github.com/triton-inference-server/python_backend#business-logic-scripting)
+to learn more about BLS models.
+
+*Mandatory parameters*
+
+| Name | Description |
+| :----------------------: | :-----------------------------: |
+| `triton_max_batch_size` | The maximum batch size that the model can handle. |
+| `decoupled_mode` | Whether to use decoupled mode. |
+| `bls_instance_count` | The number of instances of the model to run. When using the BLS model instead of the ensemble, you should set the number of model instances to the maximum batch size supported by the TRT engine to allow concurrent request execution. |
+| `logits_datatype` | The data type for context and generation logits. |
+
+*Optional parameters*
+
+- General
+
+| Name | Description |
+| :----------------------: | :-----------------------------: |
+| `accumulate_tokens` | Used in streaming mode to call the postprocessing model with all accumulated tokens, instead of only one token. This might be necessary for certain tokenizers. |
+
+- Speculative decoding
+
+The BLS model supports speculative decoding. The target and draft Triton models are set with the parameters `tensorrt_llm_model_name` and `tensorrt_llm_draft_model_name`. Speculative decoding is performed by setting `num_draft_tokens` in the request. `use_draft_logits` may be set to use logits-comparison speculative decoding. Note that `return_generation_logits` and `return_context_logits` are not supported when using speculative decoding. Also note that requests with a batch size greater than 1 are currently not supported with speculative decoding.
+
+| Name | Description |
+| :----------------------: | :-----------------------------: |
+| `tensorrt_llm_model_name` | The name of the TensorRT-LLM model to use. |
+| `tensorrt_llm_draft_model_name` | The name of the TensorRT-LLM draft model to use. |
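+
+For reference, a minimal request to the `tensorrt_llm_bls` model can be sent in the same HTTP `generate` style as the curl examples used elsewhere in these docs. The sketch below is a hypothetical request with placeholder values; the exact set of accepted fields depends on the deployed `config.pbtxt`, and `num_draft_tokens` would only be added once a draft model has been configured:
+
+```bash
+# Hypothetical request against a server listening on the default HTTP port.
+curl -X POST localhost:8000/v2/models/tensorrt_llm_bls/generate \
+  -d '{"text_input": "What is machine learning?", "parameters": {"max_tokens": 64, "beam_width": 1, "stream": false, "temperature": 0.5}}'
+```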
+
+### Model Input and Output
+
+Below are the lists of input and output tensors for the `tensorrt_llm` and
+`tensorrt_llm_bls` models.
+
+#### Common Inputs
+
+| Name | Shape | Type | Description |
+| :------------: | :---------------: | :-----------: | :--------: |
+| `end_id` | [1] | `int32` | End token ID. If not specified, defaults to -1 |
+| `pad_id` | [1] | `int32` | Padding token ID |
+| `temperature` | [1] | `float32` | Sampling Config param: `temperature` |
+| `repetition_penalty` | [1] | `float` | Sampling Config param: `repetitionPenalty` |
+| `min_length` | [1] | `int32_t` | Sampling Config param: `minLength` |
+| `presence_penalty` | [1] | `float` | Sampling Config param: `presencePenalty` |
+| `frequency_penalty` | [1] | `float` | Sampling Config param: `frequencyPenalty` |
+| `random_seed` | [1] | `uint64_t` | Sampling Config param: `randomSeed` |
+| `return_log_probs` | [1] | `bool` | When `true`, include log probs in the output |
+| `return_context_logits` | [1] | `bool` | When `true`, include context logits in the output |
+| `return_generation_logits` | [1] | `bool` | When `true`, include generation logits in the output |
+| `num_return_sequences` | [1] | `int32_t` | Number of generated sequences per request. (Default=1) |
+| `beam_width` | [1] | `int32_t` | Beam width for this request; set to 1 for greedy sampling (Default=1) |
+| `prompt_embedding_table` | [1] | `float16` (model data type) | P-tuning prompt embedding table |
+| `prompt_vocab_size` | [1] | `int32` | P-tuning prompt vocab size |
+| `return_perf_metrics` | [1] | `bool` | When `true`, include perf metrics in the output, such as KV cache reuse stats |
+| `guided_decoding_guide_type` | [1] | `string` | Guided decoding param: `guide_type` |
+| `guided_decoding_guide` | [1] | `string` | Guided decoding param: `guide` |
+
+The following LoRA inputs apply to both the `tensorrt_llm` and `tensorrt_llm_bls`
+models. The inputs are passed through the `tensorrt_llm` model, and the
+`tensorrt_llm_bls` model will refer to the inputs from the `tensorrt_llm` model.
+
+| Name | Shape | Type | Description |
+| :------------: | :---------------: | :-----------: | :--------: |
+| `lora_task_id` | [1] | `uint64` | The unique task ID for the given LoRA. To perform inference with a specific LoRA for the first time, `lora_task_id`, `lora_weights`, and `lora_config` must all be given. The LoRA will be cached, so that subsequent requests for the same task only require `lora_task_id`. If the cache is full, the oldest LoRA will be evicted to make space for new ones. An error is returned if `lora_task_id` is not cached |
+| `lora_weights` | [ num_lora_modules_layers, D x Hi + Ho x D ] | `float` (model data type) | Weights for a LoRA adapter. See the config file for more details. |
+| `lora_config` | [ num_lora_modules_layers, 3] | `int32` | Module identifier. See the config file for more details. |
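+
+Several of the boolean inputs above simply opt additional tensors into the response; the outputs they unlock are listed in the next section. As an illustrative, hypothetical request (the field names must match what your ensemble or BLS configuration actually exposes):
+
+```bash
+# Placeholder prompt and values; the flags ride along with the usual sampling parameters.
+curl -X POST localhost:8000/v2/models/ensemble/generate \
+  -d '{"text_input": "Hello, my name is", "parameters": {"max_tokens": 32, "return_log_probs": true, "return_perf_metrics": true}}'
+```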
+
+#### Common Outputs
+
+Note: the timing metrics outputs are represented as the number of nanoseconds since epoch.
+
+| Name | Shape | Type | Description |
+| :------------: | :---------------: | :-----------: | :--------: |
+| `cum_log_probs` | [-1] | `float` | Cumulative log probabilities for each output |
+| `output_log_probs` | [beam_width, -1] | `float` | Log probabilities for each output |
+| `context_logits` | [-1, vocab_size] | `float` | Context logits for the input |
+| `generation_logits` | [beam_width, seq_len, vocab_size] | `float` | Generation logits for each output |
+| `batch_index` | [1] | `int32` | Batch index |
+| `kv_cache_alloc_new_blocks` | [1] | `int32` | KV cache reuse metrics. Number of newly allocated blocks per request. Set the optional input `return_perf_metrics` to `true` to include `kv_cache_alloc_new_blocks` in the outputs. |
+| `kv_cache_reused_blocks` | [1] | `int32` | KV cache reuse metrics. Number of reused blocks per request. Set the optional input `return_perf_metrics` to `true` to include `kv_cache_reused_blocks` in the outputs. |
+| `kv_cache_alloc_total_blocks` | [1] | `int32` | KV cache reuse metrics. Number of total allocated blocks per request. Set the optional input `return_perf_metrics` to `true` to include `kv_cache_alloc_total_blocks` in the outputs. |
+| `arrival_time_ns` | [1] | `float` | Time when the request was received by TRT-LLM. Set the optional input `return_perf_metrics` to `true` to include `arrival_time_ns` in the outputs. |
+| `first_scheduled_time_ns` | [1] | `float` | Time when the request was first scheduled. Set the optional input `return_perf_metrics` to `true` to include `first_scheduled_time_ns` in the outputs. |
+| `first_token_time_ns` | [1] | `float` | Time when the first token was generated. Set the optional input `return_perf_metrics` to `true` to include `first_token_time_ns` in the outputs. |
+| `last_token_time_ns` | [1] | `float` | Time when the last token was generated. Set the optional input `return_perf_metrics` to `true` to include `last_token_time_ns` in the outputs. |
+| `acceptance_rate` | [1] | `float` | Acceptance rate of the speculative decoding model. Set the optional input `return_perf_metrics` to `true` to include `acceptance_rate` in the outputs. |
+| `total_accepted_draft_tokens` | [1] | `int32` | Number of tokens accepted by the target model in speculative decoding. Set the optional input `return_perf_metrics` to `true` to include `total_accepted_draft_tokens` in the outputs. |
+| `total_draft_tokens` | [1] | `int32` | Maximum number of draft tokens acceptable by the target model in speculative decoding. Set the optional input `return_perf_metrics` to `true` to include `total_draft_tokens` in the outputs.
| + +#### Unique Inputs for tensorrt_llm model + +| Name | Shape | Type | Description | +| :------------: | :---------------: | :-----------: | :--------: | +| `input_ids` | [-1] | `int32` | Input token IDs | +| `input_lengths` | [1] | `int32` | Input lengths | +| `request_output_len` | [1] | `int32` | Requested output length | +| `draft_input_ids` | [-1] | `int32` | Draft input IDs | +| `decoder_input_ids` | [-1] | `int32` | Decoder input IDs | +| `decoder_input_lengths` | [1] | `int32` | Decoder input lengths | +| `draft_logits` | [-1, -1] | `float32` | Draft logits | +| `draft_acceptance_threshold` | [1] | `float32` | Draft acceptance threshold | +| `stop_words_list` | [2, -1] | `int32` | List of stop words | +| `bad_words_list` | [2, -1] | `int32` | List of bad words | +| `embedding_bias` | [-1] | `string` | Embedding bias words | +| `runtime_top_k` | [1] | `int32` | Top-k value for runtime top-k sampling | +| `runtime_top_p` | [1] | `float32` | Top-p value for runtime top-p sampling | +| `runtime_top_p_min` | [1] | `float32` | Minimum value for runtime top-p sampling | +| `runtime_top_p_decay` | [1] | `float32` | Decay value for runtime top-p sampling | +| `runtime_top_p_reset_ids` | [1] | `int32` | Reset IDs for runtime top-p sampling | +| `len_penalty` | [1] | `float32` | Controls how to penalize longer sequences in beam search (Default=0.f) | +| `early_stopping` | [1] | `bool` | Enable early stopping | +| `beam_search_diversity_rate` | [1] | `float32` | Beam search diversity rate | +| `stop` | [1] | `bool` | Stop flag | +| `streaming` | [1] | `bool` | Enable streaming | + +#### Unique Outputs for tensorrt_llm model + +| Name | Shape | Type | Description | +| :------------: | :---------------: | :-----------: | :--------: | +| `output_ids` | [-1, -1] | `int32` | Output token IDs | +| `sequence_length` | [-1] | `int32` | Sequence length | + +#### Unique Inputs for tensorrt_llm_bls model + +| Name | Shape | Type | Description | +| :------------: | :---------------: | :-----------: | :--------: | +| `text_input` | [-1] | `string` | Prompt text | +| `decoder_text_input` | [1] | `string` | Decoder input text | +| `image_input` | [3, 224, 224] | `float16` | Input image | +| `max_tokens` | [-1] | `int32` | Number of tokens to generate | +| `bad_words` | [2, num_bad_words] | `int32` | Bad words list | +| `stop_words` | [2, num_stop_words] | `int32` | Stop words list | +| `top_k` | [1] | `int32` | Sampling Config param: `topK` | +| `top_p` | [1] | `float32` | Sampling Config param: `topP` | +| `length_penalty` | [1] | `float32` | Sampling Config param: `lengthPenalty` | +| `stream` | [1] | `bool` | When `true`, stream out tokens as they are generated. When `false` return only when the full generation has completed (Default=`false`) | +|`embedding_bias_words` | [-1] | `string` | Embedding bias words | +| `embedding_bias_weights` | [-1] | `float32` | Embedding bias weights | +| `num_draft_tokens` | [1] | `int32` | Number of tokens to get from draft model during speculative decoding | +| `use_draft_logits` | [1] | `bool` | Use logit comparison during speculative decoding | + +#### Unique Outputs for tensorrt_llm_bls model + +| Name | Shape | Type | Description | +| :------------: | :---------------: | :-----------: | :--------: | +| `text_output` | [-1] | `string` | Text output | + +## Some tips for model configuration + +Below are some tips for configuring models for optimal performance. These +recommendations are based on our experiments and may not apply to all use cases. 
+For guidance on other parameters, please refer to the +[perf_best_practices](https://github.com/NVIDIA/TensorRT-LLM/blob/main/docs/source/performance/perf-best-practices.md). + +- **Setting the `instance_count` for models to better utilize inflight batching** + + The `instance_count` parameter in the config.pbtxt file specifies the number + of instances of the model to run. Ideally, this should be set to match the + maximum batch size supported by the TRT engine, as this allows for concurrent + request execution and reduces performance bottlenecks. However, it will also + consume more CPU memory resources. While the optimal value isn't something we + can determine in advance, it generally shouldn't be set to a very small + value, such as 1. + For most use cases, we have found that setting `instance_count` to 5 works + well across a variety of workloads in our experiments. + +- **Adjusting `max_batch_size` and `max_num_tokens` to optimize inflight batching** + + `max_batch_size` and `max_num_tokens` are important parameters for optimizing + inflight batching. You can modify `max_batch_size` in the model configuration + file, while `max_num_tokens` is set during the conversion to a TRT-LLM engine + using the `trtllm-build` command. Tuning these parameters is necessary for + different scenarios, and experimentation is currently the best approach to + finding optimal values. Generally, the total number of requests should be + lower than `max_batch_size`, and the total tokens should be less than + `max_num_tokens`. diff --git a/docs/multimodal.md b/docs/multimodal.md new file mode 100755 index 00000000..057bc3bf --- /dev/null +++ b/docs/multimodal.md @@ -0,0 +1,422 @@ +# End to end workflow to run a Multimodal model + +### Support Matrix +The following multimodal model is supported in tensorrtllm_backend: +* BLIP2-OPT +* LLAVA +* VILA +* LLaVA OneVision +* MLLAMA +* Qwen2-VL + +For more multimodal models supported in TensorRT-LLM, please visit [TensorRT-LLM multimodal examples](https://github.com/NVIDIA/TensorRT-LLM/tree/main/examples/multimodal). + +## Run Multimodal with single-GPU Tritonserver +### Tritonserver setup steps +0. Make sure that you have initialized the TRT-LLM submodule: + + ```bash + git clone https://github.com/triton-inference-server/tensorrtllm_backend.git && cd tensorrtllm_backend + git lfs install + git submodule update --init --recursive + ``` + +1. Start the Triton Server Docker container: + + 1-1. If you're using Tritonserver from nvcr.io + ```bash + # Replace with the version of Triton you want to use. + # The command below assumes the the current directory is the + # TRT-LLM backend root git repository. + + docker run --rm -ti --net=host -v `pwd`:/mnt -w /mnt --gpus all nvcr.io/nvidia/tritonserver:\-trtllm-python-py3 bash + ``` + 1-2. If you are using `tensorrtllm_backend` container: + ```bash + docker run --rm -ti --net=host -v `pwd`:/mnt -w /mnt --gpus all triton_trt_llm + ``` + +2. Build the engine: + + 2-1. 
Clone the target model repository + ```bash + # For BLIP-OPT2 + export MODEL_NAME="blip2-opt-2.7b" + git clone https://huggingface.co/Salesforce/${MODEL_NAME} tmp/hf_models/${MODEL_NAME} + + # For LLAVA + export MODEL_NAME="llava-1.5-7b-hf" + git clone https://huggingface.co/llava-hf/${MODEL_NAME} tmp/hf_models/${MODEL_NAME} + + # For VILA + pip install -r all_models/multimodal/requirements-vila.txt + + export MODEL_NAME="vila1.5-3b" + git clone https://huggingface.co/Efficient-Large-Model/${MODEL_NAME} tmp/hf_models/${MODEL_NAME} + + export VILA_PATH="tmp/hf_models/VILA" + git clone https://github.com/Efficient-Large-Model/VILA.git ${VILA_PATH} + + # For LLaVA OneVision + pip install -r all_models/multimodal/requirements-llava-onevision.txt + + export MODEL_NAME="llava-onevision-qwen2-7b-ov-hf" + git clone https://huggingface.co/llava-hf/${MODEL_NAME} tmp/hf_models/${MODEL_NAME} + + # For MLLAMA + pip install -r all_models/multimodal/requirements-mllama.txt + + export MODEL_NAME="Llama-3.2-11B-Vision" + git clone https://huggingface.co/meta-llama/${MODEL_NAME} tmp/hf_models/${MODEL_NAME} + + # For Qwen2-VL + pip install -r all_models/multimodal/requirements-qwen2vl.txt + + export MODEL_NAME="Qwen2-VL-7B-Instruct" + git clone https://huggingface.co/Qwen/${MODEL_NAME} tmp/hf_models/${MODEL_NAME} + + export + ``` + 2-2. Build TensorRT-LLM engines + ```bash + export HF_MODEL_PATH=tmp/hf_models/${MODEL_NAME} + export UNIFIED_CKPT_PATH=tmp/trt_models/${MODEL_NAME}/fp16/1-gpu + export ENGINE_PATH=tmp/trt_engines/${MODEL_NAME}/fp16/1-gpu + export MULTIMODAL_ENGINE_PATH=tmp/trt_engines/${MODEL_NAME}/multimodal_encoder + + # For BLIP-OPT2 + python tensorrt_llm/examples/opt/convert_checkpoint.py --model_type blip2 \ + --model_dir ${HF_MODEL_PATH} \ + --output_dir ${UNIFIED_CKPT_PATH} \ + --dtype float16 + + trtllm-build \ + --checkpoint_dir ${UNIFIED_CKPT_PATH} \ + --output_dir ${ENGINE_PATH} \ + --gemm_plugin float16 \ + --max_beam_width 1 \ + --max_batch_size 8 \ + --max_seq_len 1024 \ + --max_input_len 924 \ + --max_multimodal_len 256 # 8 (max_batch_size) * 32 (num_multimodal_features) for BLIP2 + + python tensorrt_llm/examples/multimodal/build_multimodal_engine.py --model_type blip2 --model_path ${HF_MODEL_PATH} --max_batch_size 8 + + # For LLAVA + python tensorrt_llm/examples/llama/convert_checkpoint.py \ + --model_dir ${HF_MODEL_PATH} \ + --output_dir ${UNIFIED_CKPT_PATH} \ + --dtype float16 + + trtllm-build \ + --checkpoint_dir ${UNIFIED_CKPT_PATH} \ + --output_dir ${ENGINE_PATH} \ + --gemm_plugin float16 \ + --max_batch_size 8 \ + --max_input_len 2048 \ + --max_seq_len 2560 \ + --max_multimodal_len 4608 # 8 (max_batch_size) * 576 (num_multimodal_features) for LLaVA + + python tensorrt_llm/examples/multimodal/build_multimodal_engine.py --model_path ${HF_MODEL_PATH} --model_type llava --max_batch_size 8 + + # For VILA + python tensorrt_llm/examples/llama/convert_checkpoint.py \ + --model_dir ${HF_MODEL_PATH} \ + --output_dir ${UNIFIED_CKPT_PATH} \ + --dtype float16 + + trtllm-build \ + --checkpoint_dir ${UNIFIED_CKPT_PATH} \ + --output_dir ${ENGINE_PATH} \ + --gemm_plugin float16 \ + --max_batch_size 8 \ + --max_input_len 2048 \ + --max_seq_len 2560 \ + --max_multimodal_len 6272 # 8 (max_batch_size) * 196 (num_multimodal_features) * 4 (max_num_images_per_request) + + python tensorrt_llm/examples/multimodal/build_multimodal_engine.py --model_path ${HF_MODEL_PATH} --model_type vila --vila_path ${VILA_PATH} --max_batch_size 32 #max_batch_size * max_num_images_per_request since vila support 
multiple images inference + + # For LLaVA OneVision + python tensorrt_llm/examples/qwen/convert_checkpoint.py \ + --model_dir ${HF_MODEL_PATH} \ + --output_dir ${UNIFIED_CKPT_PATH} \ + --dtype float16 + + trtllm-build \ + --checkpoint_dir ${UNIFIED_CKPT_PATH} \ + --output_dir ${ENGINE_PATH} \ + --gemm_plugin float16 \ + --max_batch_size 1 \ + --max_input_len 7500 \ + --max_seq_len 7600 \ + --max_multimodal_len 7300 # max_batch_size * num_multimodal_features(depends on the image size or the specified video num frame) + + python tensorrt_llm/examples/multimodal/build_multimodal_engine.py --model_path ${HF_MODEL_PATH} --model_type llava_onevision --max_batch_size 16 # max_batch_size * patch for image or frame for video + + # For MLLAMA + python tensorrt_llm/examples/mllama/convert_checkpoint.py \ + --model_dir ${HF_MODEL_PATH} \ + --output_dir ${UNIFIED_CKPT_PATH} \ + --dtype bfloat16 + + trtllm-build \ + --checkpoint_dir ${UNIFIED_CKPT_PATH} \ + --output_dir ${ENGINE_PATH} \ + --gemm_plugin auto \ + --max_batch_size 8 \ + --max_seq_len 2048 \ + --max_num_tokens 4096 \ + --max_encoder_input_len 6404 + + python tensorrt_llm/examples/multimodal/build_multimodal_engine.py --model_path ${HF_MODEL_PATH} --model_type mllama --output_dir ${MULTIMODAL_ENGINE_PATH} --max_batch_size 8 #max_batch_size * max_num_images_per_request + + # For Qwen2-VL + python3 ../qwen/convert_checkpoint.py \ + --model_dir ${HF_MODEL_PATH} \ + --output_dir ${UNIFIED_CKPT_PATH} \ + --dtype float16 + + trtllm-build --checkpoint_dir ${UNIFIED_CKPT_PATH} \ + --output_dir ${ENGINE_PATH} \ + --gemm_plugin=float16 \ + --gpt_attention_plugin=float16 \ + --max_batch_size 4 \ + --max_input_len 2048 \ + --max_seq_len 3072 \ + --max_multimodal_len 1296 #(max_batch_size) * 324 (num_multimodal_features), this's for image_shape=[504,504] + + python build_multimodal_engine.py --model_type qwen2_vl --model_path tmp/hf_models/${MODEL_NAME} --output_dir ${MULTIMODAL_ENGINE_PATH} + ``` + + > **NOTE**: + > + > `max_multimodal_len = max_batch_size * num_multimodal_features`, so if you change `max_batch_size`, `max_multimodal_len` **MUST** be changed accordingly. + > For multi-image inference, where a single request could contain multiple images, `max_multimodal_len = max_batch_size * num_multimodal_features * max_num_images_per_request` + > + > The built visual engines are located in `tmp/trt_engines/${MODEL_NAME}/multimodal_encoder`. + +3. 
Prepare Tritonserver configs + + ```bash + cp tensorrt_llm/triton_backend/all_models/inflight_batcher_llm/ multimodal_ifb -r + # Override the ensemble and creates new multimodal_encoders directories for multimodal + cp tensorrt_llm/triton_backend/all_models/multimodal/ensemble multimodal_ifb -r + cp tensorrt_llm/triton_backend/all_models/multimodal/multimodal_encoders multimodal_ifb -r + + python3 tensorrt_llm/triton_backend/tools/fill_template.py -i multimodal_ifb/tensorrt_llm/config.pbtxt triton_backend:tensorrtllm,triton_max_batch_size:8,decoupled_mode:False,max_beam_width:1,engine_dir:${ENGINE_PATH},enable_kv_cache_reuse:False,batching_strategy:inflight_fused_batching,max_queue_delay_microseconds:0,enable_chunked_context:False,encoder_input_features_data_type:${ENCODER_INPUT_FEATURES_DTYPE},logits_datatype:TYPE_FP32,cross_kv_cache_fraction:0.5 + + python3 tensorrt_llm/triton_backend/tools/fill_template.py -i multimodal_ifb/preprocessing/config.pbtxt tokenizer_dir:${HF_MODEL_PATH},triton_max_batch_size:8,preprocessing_instance_count:1,multimodal_model_path:${MULTIMODAL_ENGINE_PATH},engine_dir:${ENGINE_PATH},max_num_images:1,max_queue_delay_microseconds:20000 + + python3 tensorrt_llm/triton_backend/tools/fill_template.py -i multimodal_ifb/postprocessing/config.pbtxt tokenizer_dir:${HF_MODEL_PATH},triton_max_batch_size:8,postprocessing_instance_count:1 + + python3 tensorrt_llm/triton_backend/tools/fill_template.py -i multimodal_ifb/ensemble/config.pbtxt triton_max_batch_size:8,logits_datatype:TYPE_FP32 + + python3 tensorrt_llm/triton_backend/tools/fill_template.py -i multimodal_ifb/tensorrt_llm_bls/config.pbtxt triton_max_batch_size:8,decoupled_mode:False,bls_instance_count:1,accumulate_tokens:False,tensorrt_llm_model_name:tensorrt_llm,multimodal_encoders_name:multimodal_encoders,logits_datatype:TYPE_FP32 + + # Newly added for multimodal + python3 tensorrt_llm/triton_backend/tools/fill_template.py -i multimodal_ifb/multimodal_encoders/config.pbtxt triton_max_batch_size:8,multimodal_model_path:${MULTIMODAL_ENGINE_PATH},encoder_input_features_data_type:${ENCODER_INPUT_FEATURES_DTYPE},hf_model_path:${HF_MODEL_PATH},max_queue_delay_microseconds:20000 + ``` + > **NOTE**: + > + > You can set the `decoupled_mode` option to True to use streaming mode. + > + > You can set the `accumulate_tokens` option to True in streaming mode to call the postprocessing model with all accumulated tokens. + > + > You can set the `enable_kv_cache_reuse` option to True to enable kv cache reuse. Requests with the same image/prompt table/input tokens will reuse the KV cache, which will help reduce latency. The specific performance improvement depends on the length of reuse. + > + > You can set the `max_num_images` to the max number of images per request. The value should be the same as the `max_num_images_per_request` value used at build the engine step above. + > + > Set `${ENCODER_INPUT_FEATURES_DTYPE}` to `TYPE_BF16` for mllama, and `TYPE_FP16` for other models. + > `cross_kv_cache_fraction` is used to determine the paged kv cache memory pool size of enc-dec models. For such case, we distinguish `free_fraction * (1 - cross_kv_cache_fraction)` to self attention kv caches, and `free_fraction * cross_kv_cache_fraction` to cross attention kv caches. + +4. 
Launch Tritonserver + + ```bash + python3 tensorrt_llm/triton_backend/scripts/launch_triton_server.py --world_size 1 --model_repo=multimodal_ifb/ --tensorrt_llm_model_name tensorrt_llm,multimodal_encoders --multimodal_gpu0_cuda_mem_pool_bytes 300000000 + ``` + + > **NOTE**: + > If there is an error associated with 'MPI_Init_thread', please do `export PMIX_MCA_gds=hash`' + > + > When launching the server, since the prompt_embedding_table is in GPU memory, we need to set the CUDA pool memory for inter-step communication. For example, when we have a shape of (1, 576, 4096) promp_embedding table, we would need 300MB of CUDA pool memory, so we set 30MB to have some GPU buffers. (2(fp16=>2bytes) * 576 * 4096 * 8(max_batch_size) = 18,874,368) + > + > Also, the tensorrt_llm initialization assumes using another GPU, we need to initialize it but not use them. + +### Send requests +1. Send request with `decoupled_mode` set to False + ```bash + python tensorrt_llm/triton_backend/tools/multimodal/client.py --text 'Question: which city is this? Answer:' --image 'https://storage.googleapis.com/sfr-vision-language-research/LAVIS/assets/merlion.png' --request-output-len 16 --model_type blip2 + + [beam 0 ]: + Question: which city is this? Answer: singapore + [INFO] Latency: 41.942 ms + ``` +2. Send request with `decoupled_mode` set to True + ```bash + python tensorrt_llm/triton_backend/tools/multimodal/client.py --text 'Question: which city is this? Answer:' --image 'https://storage.googleapis.com/sfr-vision-language-research/LAVIS/assets/merlion.png' --request-output-len 16 --model_type blip2 --streaming + + [beam 0 ]: sing + [beam 0 ]: apore + [beam 0 ]: + [INFO] Latency: 43.441 ms + ``` +3. Send request to the `tensorrt_llm_bls` model + ```bash + python tensorrt_llm/triton_backend/tools/multimodal/client.py --text 'Question: which city is this? Answer:' --image 'https://storage.googleapis.com/sfr-vision-language-research/LAVIS/assets/merlion.png' --request-output-len 16 --model_type blip2 --use_bls + + [beam 0 ]: + Question: which city is this? Answer: singapore + [INFO] Latency: 44.152 ms + ``` + +4. Send request to the `tensorrt_llm_bls` model with `accumulate_tokens` set to True + ```bash + python tensorrt_llm/triton_backend/tools/multimodal/client.py --text 'Question: which city is this? Answer:' --image 'https://storage.googleapis.com/sfr-vision-language-research/LAVIS/assets/merlion.png' --request-output-len 16 --model_type blip2 --use_bls --streaming + + [beam 0 ]: sing + [beam 0 ]: singapore + [beam 0 ]: singapore + [INFO] Latency: 45.48 ms + ``` + +5. Send request with `enable_kv_cache_reuse` set to True + ```bash + python tensorrt_llm/triton_backend/tools/multimodal/client.py --text 'Question: which city is this? Answer:' --image 'https://storage.googleapis.com/sfr-vision-language-research/LAVIS/assets/merlion.png' --request-output-len 16 --model_type blip2 --prompt_table_extra_id ${id} + + [beam 0 ]: + Question: which city is this? Answer: singapore + [INFO] Latency: 42.514 ms + ``` +6. Send request with multiple images per request + ```bash + wget -O av.png https://raw.githubusercontent.com/Efficient-Large-Model/VILA/main/demo_images/av.png + + python tensorrt_llm/triton_backend/tools/multimodal/client.py --text '\n\n Please elaborate what you see in the images?' 
--image av.png,'https://storage.googleapis.com/sfr-vision-language-research/LAVIS/assets/merlion.png' --request-output-len 68 --model_type vila --hf_model_dir ${HF_MODEL_PATH} + + [beam 0 ]: + A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER: \n \n Please elaborate what you see in the images? ASSISTANT: The first image shows a busy street scene with a car driving through a crosswalk, surrounded by pedestrians and traffic lights. The second image captures a beautiful sunset with the iconic Merlion statue spouting water into the bay, with the Singapore Flyer and the city skyline in the background. + + [INFO] Latency: 403.879 ms + ``` + +7. Send request with curl + The triton server supports curl requests with an image url in the payload. For example here is a request sent to a Llama-3.2-11B-Vision (mLLama) model: + ``` bash + curl -X POST localhost:8000/v2/models/ensemble/generate_stream \ + -d '{"id": "42", "text_input": "<|image|>If I had to write a haiku for this one", "image_url_input": "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/assets/merlion.png", "parameters": {"max_tokens": 16, "beam_width": 1, "end_id": 128001, "pad_id": 128004, "top_k": 1, "top_p": 0, "stream": false, "temperature": 0}}' + + # response + data: {"batch_index":0,"context_logits":0.0,"cum_log_probs":0.0,"generation_logits":0.0,"id":"42","model_name":"ensemble","model_version":"1","output_log_probs":[0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0],"sequence_end":false,"sequence_id":0,"sequence_index":0,"sequence_start":false,"text_output":"If I had to write a haiku for this one, it would be:.\\nMerlion spouts water.\\nMarina"} + ``` + You can also send requests with base64 encoded images. Just replace the url above with `data:image/jpeg;base64,`. + +8. Send request with video input + ```bash + python tensorrt_llm/triton_backend/tools/multimodal/client.py --text "Why is this video funny?" --video sample_demo_1.mp4 --video_num_frames 8 --request-output-len 30 --model_type llava_onevision --end-id 151645 + + [beam 0 ]: + user + Why is this video funny?assistant + The video is funny because the child's actions are playful and exaggerated, as if they are reading the book with great enthusiasm. + [INFO] Latency: 507.537 ms + ``` + +> **NOTE**: +> Please ignore any exception thrown with the output. It's a known issue to be fixed. +> +> When `enable_kv_cache_reuse` is set to true, the `prompt_table_extra_id` must be specified in the requests. The `prompt_table_extra_id` is a unique identifier representing the image (or prompt table), the same image uses the same id. The data type is `uint64`, and the minimum value is 1. + +### Kill the server +```bash +pkill tritonserver +``` + +### Supported image input types +When programmatically preparing your own request for the server, note that `ensemble`: +- `image_input`: a float16 5D tensor of shape `[batch_size, num_images, num_channels, height, width]` or `[batch_size, num_images, height, width, num_channels]` representing a batch of images already processed (via transformers AutoProcessor) for the vision encoder. +- `image_bytes_input`: a uint8 5D tensor of shape `[batch_size, num_images, num_channels, height, width]` or `[batch_size, num_images, height, width, num_channels]` representing a batch of raw images. +- `image_url_input`: a list of strings of shape `[batch_size, num_images]` representing a batch of image urls. 
+ +You may populate only one of these image inputs in a request. We suggest you use `image_bytes_input` when using grpc requests and `image_url_input` when sending http requests. For grpc requests where the client can preprocess images to reduce load on the server, use `image_input`. Note that `tensorrt_llm_bls` only supports `image_input`. + +### Long multimodal context, FP8 KV cache and tensor parallelism + +Follow these steps to enable chunked context inference (using LLaVA as an example) with FP8 KV cache and 2-way tensor parallelism. Ensure you convert the checkpoint using `--tp_size 2` and build the model with `--use_paged_context_fmha enable` and `--use_fp8_context_fmha enable`. Set the chunked context to true in the Tritonserver configuration file. The chunk size is determined by the `max_num_tokens` flag when building the engine, which defaults to 8192. When launching the server, you need to change the `--world_size` to match your tensor parallelism size. +1. Build the engine +```bash + export MODEL_NAME="llava-1.5-7b-hf" + export HF_MODEL_PATH=tmp/hf_models/${MODEL_NAME} + + # Convert checkpoint + # For fp16 KV cache + export UNIFIED_CKPT_PATH=tmp/trt_models/${MODEL_NAME}/fp8/2-gpu + export ENGINE_PATH=tmp/trt_engines/${MODEL_NAME}/fp8/2-gpu + export MULTIMODAL_ENGINE_PATH=tmp/trt_engines/${MODEL_NAME}/multimodal_encoder + python tensorrt_llm/examples/llama/convert_checkpoint.py \ + --model_dir ${HF_MODEL_PATH} \ + --output_dir ${UNIFIED_CKPT_PATH} \ + --dtype float16 \ + --tp_size 2 + + # For fp8 KV cache + export UNIFIED_CKPT_PATH=tmp/trt_models/${MODEL_NAME}/fp8/2-gpu + export ENGINE_PATH=tmp/trt_engines/${MODEL_NAME}/fp8/2-gpu + export MULTIMODAL_ENGINE_PATH=tmp/trt_engines/${MODEL_NAME}/multimodal_encoder + python ./tensorrt_llm/examples/quantization/quantize.py \ + --model_dir ${HF_MODEL_PATH} \ + --dtype float16 \ + --qformat fp8 \ + --kv_cache_dtype fp8 \ + --output_dir ${UNIFIED_CKPT_PATH} \ + --calib_size 512 \ + --tp_size 2 + + # Build the llm engine + # --use_paged_context_fmha and --use_fp8_context_fmha are defaultly enabled + # include --max_num_tokens to set the chunk size + trtllm-build \ + --checkpoint_dir ${UNIFIED_CKPT_PATH} \ + --output_dir ${ENGINE_PATH} \ + --gemm_plugin auto \ + --max_batch_size 8 \ + --max_input_len 2048 \ + --max_seq_len 2560 \ + --max_multimodal_len 4608 # 8 (max_batch_size) * 576 (num_multimodal_features) for LLaVA + + # Build the multimodal engine + python tensorrt_llm/examples/multimodal/build_multimodal_engine.py --model_path ${HF_MODEL_PATH} --model_type llava --max_batch_size 8 --output_dir ${MULTIMODAL_ENGINE_PATH} +``` +2. Prepare the Tritonserver config file +Prepare the Tritonserver config file with `enable_chunked_context` set to True. Also, to further utilize the free memory, we can set `kv_cache_free_gpu_mem_fraction` to 0.9. 
+```bash +cp tensorrt_llm/triton_backend/all_models/inflight_batcher_llm/ multimodal_ifb -r +# Override the ensemble and creates new multimodal_encoders directories for multimodal +cp tensorrt_llm/triton_backend/all_models/multimodal/ensemble multimodal_ifb -r +cp tensorrt_llm/triton_backend/all_models/multimodal/multimodal_encoders multimodal_ifb -r + +# Changes the enable_chunked_context to True, and set kv_cache_free_gpu_mem_fraction to 0.9 +python3 tensorrt_llm/triton_backend/tools/fill_template.py -i multimodal_ifb/tensorrt_llm/config.pbtxt triton_backend:tensorrtllm,triton_max_batch_size:8,decoupled_mode:False,max_beam_width:1,engine_dir:${ENGINE_PATH},enable_kv_cache_reuse:False,batching_strategy:inflight_fused_batching,max_queue_delay_microseconds:0,enable_chunked_context:True,encoder_input_features_data_type:${ENCODER_INPUT_FEATURES_DTYPE},logits_datatype:TYPE_FP32,kv_cache_free_gpu_mem_fraction:0.9 + +python3 tensorrt_llm/triton_backend/tools/fill_template.py -i multimodal_ifb/preprocessing/config.pbtxt tokenizer_dir:${HF_MODEL_PATH},triton_max_batch_size:8,preprocessing_instance_count:1,multimodal_model_path:${MULTIMODAL_ENGINE_PATH},engine_dir:${ENGINE_PATH},max_num_images:1,max_queue_delay_microseconds:20000 + +python3 tensorrt_llm/triton_backend/tools/fill_template.py -i multimodal_ifb/postprocessing/config.pbtxt tokenizer_dir:${HF_MODEL_PATH},triton_max_batch_size:8,postprocessing_instance_count:1 + +python3 tensorrt_llm/triton_backend/tools/fill_template.py -i multimodal_ifb/ensemble/config.pbtxt triton_max_batch_size:8,logits_datatype:TYPE_FP32 + +python3 tensorrt_llm/triton_backend/tools/fill_template.py -i multimodal_ifb/tensorrt_llm_bls/config.pbtxt triton_max_batch_size:8,decoupled_mode:False,bls_instance_count:1,accumulate_tokens:False,tensorrt_llm_model_name:tensorrt_llm,multimodal_encoders_name:multimodal_encoders,logits_datatype:TYPE_FP32 + +# Newly added for multimodal +python3 tensorrt_llm/triton_backend/tools/fill_template.py -i multimodal_ifb/multimodal_encoders/config.pbtxt triton_max_batch_size:8,multimodal_model_path:${MULTIMODAL_ENGINE_PATH},encoder_input_features_data_type:${ENCODER_INPUT_FEATURES_DTYPE},hf_model_path:${HF_MODEL_PATH},max_queue_delay_microseconds:20000 +``` +3. Launch the server +```bash +# Change --world_size to your tp size +python3 tensorrt_llm/triton_backend/scripts/launch_triton_server.py --world_size 2 --model_repo=multimodal_ifb/ --tensorrt_llm_model_name tensorrt_llm,multimodal_encoders --multimodal_gpu0_cuda_mem_pool_bytes 300000000 +``` + +When you launch the server, you will see logs similar to the following. In theory, now you can process long multimodal context up to the "max tokens in paged KV cache" value, and the context prefill phase will be done in chunk sizes. +```bash +[TensorRT-LLM][INFO] Memory usage when calculating max tokens in paged kv cache: total: 93.10 GiB, available: 85.57 GiB +... +[TensorRT-LLM][INFO] [MemUsageChange] Allocated 77.02 GiB for max tokens in paged KV cache (315488). +``` diff --git a/docs/whisper.md b/docs/whisper.md new file mode 100644 index 00000000..29f33af0 --- /dev/null +++ b/docs/whisper.md @@ -0,0 +1,142 @@ +# End to end workflow to run a Multimodal model + +### Support Matrix +The following multimodal model is supported in tensorrtllm_backend: +* Whisper +* Distil-Whisper + +## Run Whisper with single-GPU Tritonserver +### Tritonserver setup steps +0. 
Make sure that you have initialized the TRT-LLM submodule: + + ```bash + git clone https://github.com/triton-inference-server/tensorrtllm_backend.git && cd tensorrtllm_backend + git lfs install + git submodule update --init --recursive + ``` + +1. Start the Triton Server Docker container: + + 1-1. If you're using Tritonserver from nvcr.io + ```bash + # Replace with the version of Triton you want to use. + # The command below assumes the the current directory is the + # TRT-LLM backend root git repository. + + docker run --rm -ti --net=host -v `pwd`:/mnt -w /mnt --gpus all nvcr.io/nvidia/tritonserver:\-trtllm-python-py3 bash + ``` + 1-2. If you are using `tensorrtllm_backend` container: + ```bash + docker run --rm -ti --net=host -v `pwd`:/mnt -w /mnt --gpus all triton_trt_llm + ``` + +2. Build the engine: + + 2-1. Download the whisper models + ```bash + wget --directory-prefix=assets https://raw.githubusercontent.com/openai/whisper/main/whisper/assets/multilingual.tiktoken + wget --directory-prefix=assets assets/mel_filters.npz https://raw.githubusercontent.com/openai/whisper/main/whisper/assets/mel_filters.npz + wget --directory-prefix=assets https://raw.githubusercontent.com/yuekaizhang/Triton-ASR-Client/main/datasets/mini_en/wav/1221-135766-0002.wav + # take large-v3 model as an example + wget --directory-prefix=assets https://openaipublic.azureedge.net/main/whisper/models/e5b1a55b89c1367dacf97e3e19bfd829a01529dbfdeefa8caeb59b3f1b81dadb/large-v3.pt + ``` + 2-2. Build TensorRT-LLM engines + ```bash + INFERENCE_PRECISION=float16 + MAX_BEAM_WIDTH=4 + MAX_BATCH_SIZE=64 + checkpoint_dir=tllm_checkpoint + output_dir=whisper_large_v3_max_batch_${MAX_BATCH_SIZE} + + python3 convert_checkpoint.py --model_dir ${MODEL_DIR} --output_dir ${checkpoint_dir} + + trtllm-build --checkpoint_dir ${checkpoint_dir}/encoder \ + --output_dir ${output_dir}/encoder \ + --moe_plugin disable \ + --max_batch_size ${MAX_BATCH_SIZE} \ + --gemm_plugin disable \ + --bert_attention_plugin ${INFERENCE_PRECISION} \ + --max_input_len 3000 --max_seq_len=3000 + + trtllm-build --checkpoint_dir ${checkpoint_dir}/decoder \ + --output_dir ${output_dir}/decoder \ + --moe_plugin disable \ + --max_beam_width ${MAX_BEAM_WIDTH} \ + --max_batch_size ${MAX_BATCH_SIZE} \ + --max_seq_len 114 \ + --max_input_len 14 \ + --max_encoder_input_len 3000 \ + --gemm_plugin ${INFERENCE_PRECISION} \ + --bert_attention_plugin ${INFERENCE_PRECISION} \ + --gpt_attention_plugin ${INFERENCE_PRECISION} + + ``` + + > **NOTE**: + > + > TensorRT-LLM also supports using [distil-whisper's](https://github.com/huggingface/distil-whisper) different models by first converting their params and weights from huggingface's naming format to [openai whisper](https://github.com/openai/whisper) naming format. You can do so by running the script [distil_whisper/convert_from_distil_whisper.py](./convert_from_distil_whisper.py). + +3. 
Prepare Tritonserver configs + + ```bash + cp tensorrt_llm/triton_backend/all_models/whisper/ model_repo_whisper -r + cp tensorrt_llm/triton_backend/all_models/inflight_batcher_llm/tensorrt_llm model_repo_whisper -r + wget --directory-prefix=model_repo_whisper/whisper_bls/1 https://raw.githubusercontent.com/openai/whisper/main/whisper/assets/multilingual.tiktoken + wget --directory-prefix=model_repo_whisper/whisper_bls/1 https://raw.githubusercontent.com/openai/whisper/main/whisper/assets/mel_filters.npz + + BACKEND=tensorrtllm + DECOUPLED_MODE=false + DECODER_ENGINE_PATH=${output_dir}/decoder + ENCODER_ENGINE_PATH=${output_dir}/encoder + MAX_TOKENS_IN_KV_CACHE=24000 + BATCHING_STRATEGY=inflight_fused_batching + KV_CACHE_FREE_GPU_MEM_FRACTION=0.5 + EXCLUDE_INPUT_IN_OUTPUT=True + TRITON_MAX_BATCH_SIZE=8 + MAX_QUEUE_DELAY_MICROSECONDS=0 + MAX_BEAM_WIDTH=1 + MAX_QUEUE_SIZE="0" + ENABLE_KV_CACHE_REUSE=false + ENABLE_CHUNKED_CONTEXT=false + CROSS_KV_CACHE_FRACTION="0.5" + n_mels=128 + zero_pad=false + + python3 tensorrt_llm/triton_backend/tools/fill_template.py -i model_repo_whisper/tensorrt_llm/config.pbtxt triton_backend:${BACKEND},engine_dir:${DECODER_ENGINE_PATH},encoder_engine_dir:${ENCODER_ENGINE_PATH},decoupled_mode:${DECOUPLED_MODE},max_tokens_in_paged_kv_cache:${MAX_TOKENS_IN_KV_CACHE},max_attention_window_size:${MAX_ATTENTION_WINDOW_SIZE},batch_scheduler_policy:${BATCH_SCHEDULER_POLICY},batching_strategy:${BATCHING_STRATEGY},kv_cache_free_gpu_mem_fraction:${KV_CACHE_FREE_GPU_MEM_FRACTION},exclude_input_in_output:${EXCLUDE_INPUT_IN_OUTPUT},triton_max_batch_size:${TRITON_MAX_BATCH_SIZE},max_queue_delay_microseconds:${MAX_QUEUE_DELAY_MICROSECONDS},max_beam_width:${MAX_BEAM_WIDTH},enable_kv_cache_reuse:${ENABLE_KV_CACHE_REUSE},normalize_log_probs:${NORMALIZE_LOG_PROBS},enable_chunked_context:${ENABLE_CHUNKED_CONTEXT},gpu_device_ids:${GPU_DEVICE_IDS},decoding_mode:${DECODING_MODE},max_queue_size:${MAX_QUEUE_SIZE},enable_context_fmha_fp32_acc:${ENABLE_CONTEXT_FMHA_FP32_ACC},cross_kv_cache_fraction:${CROSS_KV_CACHE_FRACTION},encoder_input_features_data_type:TYPE_FP16,logits_datatype:TYPE_FP32 + + python3 tensorrt_llm/triton_backend/tools/fill_template.py -i model_repo_whisper/whisper_bls/config.pbtxt engine_dir:${ENCODER_ENGINE_PATH},n_mels:$n_mels,zero_pad:$zero_pad,triton_max_batch_size:${TRITON_MAX_BATCH_SIZE},decoupled_mode:${DECOUPLED_MODE} + ``` + > **NOTE**: + > + > TODO: You can set the `decoupled_mode` option to True to use streaming mode. + +4. Launch Tritonserver + + ```bash + python3 tensorrt_llm/triton_backend/scripts/launch_triton_server.py --world_size 1 --model_repo=model_repo_whisper/ --tensorrt_llm_model_name tensorrt_llm,whisper_bls --multimodal_gpu0_cuda_mem_pool_bytes 300000000 + ``` + +### Send requests +1. Send request with a single audio file + ```bash + wget -nc https://raw.githubusercontent.com/yuekaizhang/Triton-ASR-Client/main/datasets/mini_en/wav/1221-135766-0002.wav + # Test non-streaming + python3 tensorrt_llm/triton_backend/whisper/client.py --audio-path 1221-135766-0002.wav + ``` +2. 
Send requests with a whole audio dataset + ```bash + git clone https://github.com/yuekaizhang/Triton-ASR-Client.git + cd Triton-ASR-Client + num_task=16 + python3 tensorrt_llm/triton_backend/whisper/client.py \ + --server-addr localhost \ + --model-name whisper_bls \ + --num-tasks $num_task \ + --text-prompt "<|startoftranscript|><|zh|><|transcribe|><|notimestamps|>" \ + --manifest-dir ./datasets/aishell1_test \ + --compute-cer + ``` +### Kill the server +```bash +pkill tritonserver +``` diff --git a/images/leader-mode.png b/images/leader-mode.png new file mode 100644 index 00000000..dedc5999 Binary files /dev/null and b/images/leader-mode.png differ diff --git a/images/orchestrator-mode.png b/images/orchestrator-mode.png new file mode 100644 index 00000000..f603f50d Binary files /dev/null and b/images/orchestrator-mode.png differ diff --git a/inflight_batcher_llm/CMakeLists.txt b/inflight_batcher_llm/CMakeLists.txt deleted file mode 100644 index 941f2ade..00000000 --- a/inflight_batcher_llm/CMakeLists.txt +++ /dev/null @@ -1,367 +0,0 @@ -# Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are met: * -# Redistributions of source code must retain the above copyright notice, this -# list of conditions and the following disclaimer. * Redistributions in binary -# form must reproduce the above copyright notice, this list of conditions and -# the following disclaimer in the documentation and/or other materials provided -# with the distribution. * Neither the name of NVIDIA CORPORATION nor the names -# of its contributors may be used to endorse or promote products derived from -# this software without specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY EXPRESS -# OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES -# OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO -# EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, -# INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, -# BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY -# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING -# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, -# EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -cmake_minimum_required(VERSION 3.17) -include(${CMAKE_CURRENT_SOURCE_DIR}/cmake/modules/set_ifndef.cmake) - -set(TRITON_BUILD - OFF - CACHE STRING "Using Triton build process") - -if(TRITON_BUILD) - set_ifndef(TRTLLM_DIR ${CMAKE_CURRENT_SOURCE_DIR}/tensorrt_llm) - # Install build time dependencies. This section is excuted during cmake - # configure time. - execute_process( - COMMAND bash -x ./tools/environment_setup.sh - WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} - RESULT_VARIABLE CMD_RESULT) - if(NOT CMD_RESULT EQUAL "0") - message(FATAL_ERROR "Failed to install build time dependencies") - endif() -else() - set_ifndef(TRTLLM_DIR ${CMAKE_CURRENT_SOURCE_DIR}/../tensorrt_llm) -endif() - -include(${TRTLLM_DIR}/cpp/cmake/modules/find_library_create_target.cmake) - -project(tritontensorrtllmbackend LANGUAGES C CXX) - -# -# Options -# -# Must include options required for this project as well as any projects -# included in this one by FetchContent. 
-# -# TRITON_ENABLE_GPU is set to OFF as currently the code does not use any GPU -# related features since TRT-LLM backend manages the usage on GPUs itself. -option(TRITON_ENABLE_GPU "Enable GPU support in backend" OFF) -option(TRITON_ENABLE_STATS "Include statistics collections in backend" ON) - -# The TRTLLM_BUILD_CONTAINER is used to compile the TRT-LLM libraries that are -# needed for the TRT-LLM backend. The TRTLLM_BUILD_CONTAINER is launched -# separately, and the artifacts will be copied back to the backend installation -# directory. -if(TRITON_BUILD) - set(TRTLLM_BUILD_CONTAINER - "" - CACHE STRING "Base image for building TRT-LLM") -endif() - -set(TRITON_COMMON_REPO_TAG - "main" - CACHE STRING "Tag for triton-inference-server/common repo") -set(TRITON_CORE_REPO_TAG - "main" - CACHE STRING "Tag for triton-inference-server/core repo") -set(TRITON_BACKEND_REPO_TAG - "main" - CACHE STRING "Tag for triton-inference-server/backend repo") - -if(NOT CMAKE_BUILD_TYPE) - set(CMAKE_BUILD_TYPE Release) -endif() - -set(COMMON_HEADER_DIRS ${PROJECT_SOURCE_DIR} ${CUDA_PATH}/include) -message(STATUS "COMMON_HEADER_DIRS: ${COMMON_HEADER_DIRS}") - -# -# Dependencies -# -# FetchContent requires us to include the transitive closure of all repos that -# we depend on so that we can override the tags. -# -include(FetchContent) - -FetchContent_Declare( - repo-common - GIT_REPOSITORY https://github.com/triton-inference-server/common.git - GIT_TAG ${TRITON_COMMON_REPO_TAG} - GIT_SHALLOW ON) -FetchContent_Declare( - repo-core - GIT_REPOSITORY https://github.com/triton-inference-server/core.git - GIT_TAG ${TRITON_CORE_REPO_TAG} - GIT_SHALLOW ON) -FetchContent_Declare( - repo-backend - GIT_REPOSITORY https://github.com/triton-inference-server/backend.git - GIT_TAG ${TRITON_BACKEND_REPO_TAG} - GIT_SHALLOW ON) -FetchContent_MakeAvailable(repo-common repo-core repo-backend) - -# Compile TRT-LLM -if(TRITON_BUILD) - set(TRITON_TRTLLM_DOCKER_NAME "tritonserver-trtllm") - add_custom_command( - OUTPUT tensorrt_llm_build - COMMENT "Building TensorRT-LLM" - COMMAND - cd ${CMAKE_CURRENT_SOURCE_DIR} && python3 tools/gen_trtllm_dockerfile.py - --trtllm-build-config="${CMAKE_BUILD_TYPE}" - --trtllm-base-image="${TRTLLM_BUILD_CONTAINER}" --output=Dockerfile.trtllm - COMMAND cd ${CMAKE_CURRENT_SOURCE_DIR} && docker build --no-cache -t - ${TRITON_TRTLLM_DOCKER_NAME} -f ./Dockerfile.trtllm . - COMMAND docker rm trtllm_build || echo 'error ignored...' || true - COMMAND docker create --name trtllm_build ${TRITON_TRTLLM_DOCKER_NAME} - COMMAND cd ${CMAKE_CURRENT_SOURCE_DIR} && rm -fr tensorrt_llm - COMMAND cd ${CMAKE_CURRENT_SOURCE_DIR} && docker cp - trtllm_build:/app/tensorrt_llm tensorrt_llm - COMMAND docker cp trtllm_build:/opt/trtllm_lib trtllm_build - COMMAND docker rm trtllm_build) -endif() - -# -# The backend must be built into a shared library. Use an ldscript to hide all -# symbols except for the TRITONBACKEND API. 
-# -configure_file(src/libtriton_tensorrtllm.ldscript - libtriton_tensorrtllm.ldscript COPYONLY) -add_library(triton-tensorrt-llm-backend SHARED src/libtensorrtllm.cc) - -if(TRITON_BUILD) - add_custom_target(trtllm_target DEPENDS tensorrt_llm_build) - add_dependencies(triton-tensorrt-llm-backend trtllm_target) -endif() - -add_library(TritonTensorRTLLMBackend::triton-tensorrt-llm-backend ALIAS - triton-tensorrt-llm-backend) - -enable_language(CUDA) - -find_package(CUDA ${CUDA_REQUIRED_VERSION} REQUIRED) - -find_library( - CUDNN_LIB cudnn - HINTS ${CUDA_TOOLKIT_ROOT_DIR} ${CUDNN_ROOT_DIR} - PATH_SUFFIXES lib64 lib) -find_library( - CUBLAS_LIB cublas - HINTS ${CUDA_TOOLKIT_ROOT_DIR} - PATH_SUFFIXES lib64 lib lib/stubs) -find_library( - CUBLASLT_LIB cublasLt - HINTS ${CUDA_TOOLKIT_ROOT_DIR} - PATH_SUFFIXES lib64 lib lib/stubs) -find_library( - CUDART_LIB cudart - HINTS ${CUDA_TOOLKIT_ROOT_DIR} - PATH_SUFFIXES lib lib64) -find_library( - CUDA_DRV_LIB cuda - HINTS ${CUDA_TOOLKIT_ROOT_DIR} - PATH_SUFFIXES lib lib64 lib/stubs lib64/stubs) -set(CUDA_LIBRARIES ${CUDART_LIB}) - -find_package(MPI REQUIRED) -message(STATUS "Using MPI_INCLUDE_PATH: ${MPI_INCLUDE_PATH}") -message(STATUS "Using MPI_LIBRARIES: ${MPI_LIBRARIES}") - -# NCCL dependencies -set_ifndef(NCCL_LIB_DIR /usr/lib/x86_64-linux-gnu/) -set_ifndef(NCCL_INCLUDE_DIR /usr/include/) -find_library(NCCL_LIB nccl HINTS ${NCCL_LIB_DIR}) - -# TRT_LIB_DIR and TRT_INCLUDE_DIR should be aligned with the path in the -# environment_setup.sh script -set_ifndef(TRT_LIB_DIR - /usr/local/tensorrt/targets/${CMAKE_SYSTEM_PROCESSOR}-linux-gnu/lib) -set_ifndef(TRT_INCLUDE_DIR /usr/include/${CMAKE_SYSTEM_PROCESSOR}-linux-gnu) - -set(TRT_LIB nvinfer) -find_library_create_target(${TRT_LIB} nvinfer SHARED ${TRT_LIB_DIR}) -find_library_create_target(nvuffparser nvparsers SHARED ${TRT_LIB_DIR}) - -file(STRINGS "${TRT_INCLUDE_DIR}/NvInferVersion.h" VERSION_STRINGS - REGEX "#define NV_TENSORRT_.*") -foreach(TYPE MAJOR MINOR PATCH BUILD) - string(REGEX MATCH "NV_TENSORRT_${TYPE} [0-9]" TRT_TYPE_STRING - ${VERSION_STRINGS}) - string(REGEX MATCH "[0-9]" TRT_${TYPE} ${TRT_TYPE_STRING}) -endforeach(TYPE) - -foreach(TYPE MAJOR MINOR PATCH) - string(REGEX MATCH "NV_TENSORRT_SONAME_${TYPE} [0-9]" TRT_TYPE_STRING - ${VERSION_STRINGS}) - string(REGEX MATCH "[0-9]" TRT_SO_${TYPE} ${TRT_TYPE_STRING}) -endforeach(TYPE) - -set(TRT_VERSION - "${TRT_MAJOR}.${TRT_MINOR}.${TRT_PATCH}" - CACHE STRING "TensorRT project version") -set(TRT_SOVERSION - "${TRT_SO_MAJOR}" - CACHE STRING "TensorRT library so version") -message( - STATUS - "Building for TensorRT version: ${TRT_VERSION}, library version: ${TRT_SOVERSION}" -) - -list(APPEND COMMON_HEADER_DIRS ${TORCH_INCLUDE_DIRS} ${TRT_INCLUDE_DIR}) -include_directories(${COMMON_HEADER_DIRS}) - -target_include_directories( - triton-tensorrt-llm-backend - PRIVATE ${TRTLLM_DIR}/cpp - ${TRTLLM_DIR}/cpp/include - ${CMAKE_CURRENT_SOURCE_DIR}/src - ${CUDA_INCLUDE_DIRS} - ${CUDNN_ROOT_DIR}/include - ${NCCL_INCLUDE_DIR} - ${3RDPARTY_DIR}/cutlass/include - ${MPI_INCLUDE_PATH} - ${COMMON_HEADER_DIR}) - -target_compile_features(triton-tensorrt-llm-backend PRIVATE cxx_std_17) -target_compile_options( - triton-tensorrt-llm-backend - PRIVATE - $<$,$,$>: - -Wall - -Wextra - -Wno-unused-parameter - -Wno-type-limits> - $<$:/Wall - /D_WIN32_WINNT=0x0A00 - /EHsc>) - -add_library(tensorrt_llm STATIC IMPORTED) -set_property( - TARGET tensorrt_llm - PROPERTY IMPORTED_LOCATION - "${TRTLLM_DIR}/cpp/build/tensorrt_llm/libtensorrt_llm_static.a") - 
-add_library(tensorrt_llm_batch_manager STATIC IMPORTED) -execute_process( - COMMAND ${Python3_EXECUTABLE} "-c" - "import torch; print(torch.compiled_with_cxx11_abi(),end='');" - RESULT_VARIABLE _PYTHON_SUCCESS - OUTPUT_VARIABLE USE_CXX11_ABI) -message(STATUS "USE_CXX11_ABI: ${USE_CXX11_ABI}") -if(USE_CXX11_ABI) - set_property( - TARGET tensorrt_llm_batch_manager - PROPERTY - IMPORTED_LOCATION - "${TRTLLM_DIR}/cpp/tensorrt_llm/batch_manager/libtensorrt_llm_batch_manager_static.a" - ) -else() - set_property( - TARGET tensorrt_llm_batch_manager - PROPERTY - IMPORTED_LOCATION - "${TRTLLM_DIR}/cpp/tensorrt_llm/batch_manager/libtensorrt_llm_batch_manager_static.pre_cxx11.a" - ) -endif() - -add_library(nvinfer_plugin_tensorrt_llm SHARED IMPORTED) -set_property( - TARGET nvinfer_plugin_tensorrt_llm - PROPERTY - IMPORTED_LOCATION - "${TRTLLM_DIR}/cpp/build/tensorrt_llm/plugins/libnvinfer_plugin_tensorrt_llm.so" -) - -if(TRITON_BUILD) - add_dependencies(tensorrt_llm trtllm_target) - add_dependencies(tensorrt_llm_batch_manager trtllm_target) - add_dependencies(nvinfer_plugin_tensorrt_llm trtllm_target) -endif() - -target_link_libraries( - triton-tensorrt-llm-backend - PRIVATE tensorrt_llm_batch_manager - tensorrt_llm - triton-core-serverapi # from repo-core - triton-core-backendapi # from repo-core - triton-core-serverstub # from repo-core - triton-backend-utils # from repo-backend - ${MPI_LIBRARIES} - nvinfer - nvinfer_plugin_tensorrt_llm) - -FetchContent_Declare( - json - GIT_REPOSITORY https://github.com/nlohmann/json.git - GIT_TAG v3.11.2) - -FetchContent_MakeAvailable(json) - -target_link_libraries(triton-tensorrt-llm-backend - PRIVATE nlohmann_json::nlohmann_json) - -if(WIN32) - set_target_properties( - triton-tensorrt-llm-backend PROPERTIES POSITION_INDEPENDENT_CODE ON - OUTPUT_NAME triton_tensorrtllm) -else() - set_target_properties( - triton-tensorrt-llm-backend - PROPERTIES POSITION_INDEPENDENT_CODE ON - OUTPUT_NAME triton_tensorrtllm - LINK_DEPENDS - ${CMAKE_CURRENT_BINARY_DIR}/libtriton_tensorrtllm.ldscript - LINK_FLAGS "-Wl,--version-script libtriton_tensorrtllm.ldscript") -endif() - -# -# Install -# -include(GNUInstallDirs) -set(INSTALL_CONFIGDIR ${CMAKE_INSTALL_LIBDIR}/cmake/TritonTensorRTLLMBackend) - -install( - TARGETS triton-tensorrt-llm-backend - EXPORT triton-tensorrt-llm-backend-targets - LIBRARY DESTINATION ${CMAKE_INSTALL_PREFIX}/backends/tensorrtllm - RUNTIME DESTINATION ${CMAKE_INSTALL_PREFIX}/backends/tensorrtllm) - -if(TRITON_BUILD) - install(DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/trtllm_build/ - DESTINATION ${CMAKE_INSTALL_PREFIX}/backends/tensorrtllm) -endif() - -install( - EXPORT triton-tensorrt-llm-backend-targets - FILE TritonTensorRTLLMBackendTargets.cmake - NAMESPACE TritonTensorRTLLMBackend:: - DESTINATION ${INSTALL_CONFIGDIR}) - -include(CMakePackageConfigHelpers) -configure_package_config_file( - ${CMAKE_CURRENT_LIST_DIR}/cmake/TritonTensorRTLLMBackendConfig.cmake.in - ${CMAKE_CURRENT_BINARY_DIR}/TritonTensorRTLLMBackendConfig.cmake - INSTALL_DESTINATION ${INSTALL_CONFIGDIR}) - -install(FILES ${CMAKE_CURRENT_BINARY_DIR}/TritonTensorRTLLMBackendConfig.cmake - DESTINATION ${INSTALL_CONFIGDIR}) - -# -# Export from build tree -# -export( - EXPORT triton-tensorrt-llm-backend-targets - FILE ${CMAKE_CURRENT_BINARY_DIR}/TritonTensorRTLLMBackendTargets.cmake - NAMESPACE TritonTensorRTLLMBackend::) - -export(PACKAGE TritonTensorRTLLMBackend) diff --git a/inflight_batcher_llm/README.md b/inflight_batcher_llm/README.md deleted file mode 100644 index 4076501a..00000000 --- 
a/inflight_batcher_llm/README.md +++ /dev/null @@ -1,152 +0,0 @@ -# Instructions to run TRT-LLM in-flight batching Triton backend: - -## Build TensorRT-LLM engine for inflight batching - -To configure a Triton server that runs a model using TensorRT-LLM, it is needed to compile a TensorRT-LLM engine for that model. - -For example, for LLaMA 7B, change to the `tensorrt_llm/examples/llama` directory: - -``` -cd tensorrt_llm/examples/llama -``` -Prepare the checkpoint of the model by following the instructions [here](https://huggingface.co/docs/transformers/main/en/model_doc/llama) and store it in a model directory. Then, create the engine: - -``` -python build.py --model_dir ${model_directory} \ - --dtype bfloat16 \ - --use_gpt_attention_plugin bfloat16 \ - --use_inflight_batching \ - --paged_kv_cache \ - --remove_input_padding \ - --use_gemm_plugin bfloat16 \ - --output_dir engines/bf16/1-gpu/ -``` - -To disable the support for in-flight batching (i.e. use the V1 batching mode), remove `--use_inflight_batching`. - -Similarly, for a GPT model, change to `tensorrt_llm/examples/gpt` directory: -``` -cd tensorrt_llm/examples/gpt - -``` -Prepare the model checkpoint following the instructions in the README file, store it in a model directory and build the TRT engine with: - -``` -python3 build.py --model_dir=${model_directory} \ - --dtype float16 \ - --use_inflight_batching \ - --use_gpt_attention_plugin float16 \ - --paged_kv_cache \ - --use_gemm_plugin float16 \ - --remove_input_padding \ - --use_layernorm_plugin float16 \ - --hidden_act gelu \ - --output_dir=engines/fp16/1-gpu -``` - -## Create a model repository folder - -First run: -``` -rm -rf triton_model_repo -mkdir triton_model_repo -cp -R all_models/inflight_batcher_llm/* triton_model_repo -``` - -Then copy the TRT engine to `triton_model_repo/tensorrt_llm/1/`. For example for the LLaMA 7B example above, run: - -``` -cp -R tensorrt_llm/examples/llama/engines/bf16/1-gpu/ triton_model_repo/tensorrt_llm/1 -``` - -For the GPT example above, run: -``` -cp -R tensorrt_llm/examples/gpt/engines/fp16/1-gpu/ triton_model_repo/tensorrt_llm/1 -``` - - -Edit the `triton_model_repo/tensorrt_llm/config.pbtxt` file and replace `${decoupled_mode}` with `True` or `False`, and `${engine_dir}` with `/triton_model_repo/tensorrt_llm/1/1-gpu/` since the `triton_model_repo` folder created above will be mounted to `/triton_model_repo` in the Docker container. Decoupled mode must be set to true if using the streaming option from the client. 
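A minimal sketch of that placeholder substitution, assuming the template only uses the `${decoupled_mode}` and `${engine_dir}` placeholders mentioned above (a `sed` one-liner works equally well):

```
# Hypothetical helper that fills in the config.pbtxt placeholders described
# above. safe_substitute() leaves any other ${...} placeholders untouched.
from pathlib import Path
from string import Template

config_path = Path("triton_model_repo/tensorrt_llm/config.pbtxt")
filled = Template(config_path.read_text()).safe_substitute(
    decoupled_mode="True",  # must be True when the client uses streaming
    engine_dir="/triton_model_repo/tensorrt_llm/1/1-gpu/",
)
config_path.write_text(filled)
```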
- - -To use V1 batching, the `config.pbtxt` should have: -``` -parameters: { - key: "gpt_model_type" - value: { - string_value: "V1" - } -} -``` - -For in-flight batching, use: -``` -parameters: { - key: "gpt_model_type" - value: { - string_value: "inflight_fused_batching" - } -} -``` - -## Launch the Triton server container using the model_repository you just created - -``` -docker run --rm -it --net host --shm-size=1g --ulimit memlock=-1 --ulimit stack=67108864 --gpus='"'device=0'"' -v $(pwd)/triton_model_repo:/triton_model_repo tritonserver:w_trt_llm_backend /bin/bash -c "tritonserver --model-repository=/triton_model_repo" -``` - -## Run the provided client to send a request - -You can test the inflight batcher server with the provided reference python client as following: -``` -python3 inflight_batcher_llm/client/inflight_batcher_llm_client.py --request-output-len 200 -``` - -You can also stop the generation process early by using the `--stop-after-ms` option to send a stop request after a few milliseconds: - -``` -python inflight_batcher_llm_client.py --stop-after-ms 200 --request-output-len 200 -``` - -You will find that the generation process is stopped early and therefore the number of generated tokens is lower than 200. - -You can have a look at the client code to see how early stopping is achieved. - -## Run the e2e/identity test to benchmark - -### End to end test -End to end test script sends requests to deployed ensemble model. - -Ensemble model is ensembled by three models: preprocessing, tensorrt_llm and postprocessing. -* preprocessing: Tokenizing, meaning the conversion from prompts(string) to input_ids(list of ints). -* tensorrt_llm: Inferencing. -* postprocessing: De-tokenizing, meaning the conversion from output_ids(list of ints) to outputs(string). - -The end to end latency includes the total latency of the three parts of an ensemble model. - -``` -cd tools/inflight_batcher_llm -python3 end_to_end_test.py --dataset -``` -Expected outputs -``` -[INFO] Functionality test succeed. -[INFO] Warm up for benchmarking. -[INFO] Start benchmarking on 125 prompts. -[INFO] Total Latency: 11099.243 ms -``` - -### Identity test - -Identity test script sends requests directly to deployed tensorrt_llm model, the identity test latency indicates the inference latency of TensorRT-LLM, not including the pre/post-processing latency which is usually handled by a third-party library such as HuggingFace. - -``` -cd tools/inflight_batcher_llm -python3 identity_test.py --dataset -``` -Expected outputs -``` -[INFO] Warm up for benchmarking. -[INFO] Start benchmarking on 125 prompts. -[INFO] Total Latency: 10213.462 ms -``` -*Please note that the expected outputs in that document are only for reference, specific performance numbers depend on the GPU you're using.* diff --git a/inflight_batcher_llm/client/inflight_batcher_llm_client.py b/inflight_batcher_llm/client/inflight_batcher_llm_client.py deleted file mode 100755 index 6184e7a0..00000000 --- a/inflight_batcher_llm/client/inflight_batcher_llm_client.py +++ /dev/null @@ -1,434 +0,0 @@ -#!/usr/bin/env python -# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions -# are met: -# * Redistributions of source code must retain the above copyright -# notice, this list of conditions and the following disclaimer. 
-# * Redistributions in binary form must reproduce the above copyright -# notice, this list of conditions and the following disclaimer in the -# documentation and/or other materials provided with the distribution. -# * Neither the name of NVIDIA CORPORATION nor the names of its -# contributors may be used to endorse or promote products derived -# from this software without specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY -# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR -# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY -# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -import argparse -import queue -import sys -import time -from functools import partial - -import numpy as np -import tritonclient.grpc as grpcclient -import tritonclient.http as httpclient -from transformers import AutoTokenizer, LlamaTokenizer, T5Tokenizer -from tritonclient.utils import InferenceServerException, np_to_triton_dtype - -# -# Simple streaming client for TRT-LLM inflight bacthing backend -# -# In order for this code to work properly, config.pbtxt must contain these values: -# -# model_transaction_policy { -# decoupled: True -# } -# -# parameters: { -# key: "gpt_model_type" -# value: { -# string_value: "inflight_batching" -# } -# } -# -# In order for gpt_model_type 'inflight_batching' to work, you must copy engine from -# -# tensorrt_llm/cpp/tests/resources/models/rt_engine/gpt2/fp16-inflight-batching-plugin/1-gpu/ -# - - -class UserData: - - def __init__(self): - self._completed_requests = queue.Queue() - - -def prepare_tensor(name, input, protocol): - client_util = httpclient if protocol == "http" else grpcclient - t = client_util.InferInput(name, input.shape, - np_to_triton_dtype(input.dtype)) - t.set_data_from_numpy(input) - return t - - -def prepare_inputs(input_ids_data, input_lengths_data, request_output_len_data, - beam_width_data, temperature_data, streaming_data): - protocol = 'grpc' - inputs = [ - prepare_tensor("input_ids", input_ids_data, protocol), - prepare_tensor("input_lengths", input_lengths_data, protocol), - prepare_tensor("request_output_len", request_output_len_data, - protocol), - prepare_tensor("beam_width", beam_width_data, protocol), - prepare_tensor("temperature", temperature_data, protocol), - prepare_tensor("streaming", streaming_data, protocol), - ] - - return inputs - - -def prepare_stop_signals(): - - inputs = [ - grpcclient.InferInput('input_ids', [1, 1], "INT32"), - grpcclient.InferInput('input_lengths', [1, 1], "INT32"), - grpcclient.InferInput('request_output_len', [1, 1], "UINT32"), - grpcclient.InferInput('stop', [1, 1], "BOOL"), - ] - - inputs[0].set_data_from_numpy(np.empty([1, 1], dtype=np.int32)) - inputs[1].set_data_from_numpy(np.zeros([1, 1], dtype=np.int32)) - inputs[2].set_data_from_numpy(np.array([[0]], dtype=np.uint32)) - inputs[3].set_data_from_numpy(np.array([[True]], dtype='bool')) - - return inputs - - -# Define the callback function. 
Note the last two parameters should be -# result and error. InferenceServerClient would povide the results of an -# inference as grpcclient.InferResult in result. For successful -# inference, error will be None, otherwise it will be an object of -# tritonclientutils.InferenceServerException holding the error details -def callback(user_data, result, error): - if error: - user_data._completed_requests.put(error) - else: - user_data._completed_requests.put(result) - if (FLAGS.streaming): - output_ids = result.as_numpy('output_ids') - tokens = list(output_ids[0][0]) - print(tokens, flush=True) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - parser.add_argument( - "-v", - "--verbose", - action="http://github.com/store_true", - required=False, - default=False, - help="Enable verbose output", - ) - parser.add_argument( - "-u", - "--url", - type=str, - required=False, - default="localhost:8001", - help="Inference server URL. Default is localhost:8001.", - ) - parser.add_argument( - '--text', - type=str, - required=False, - default='Born in north-east France, Soyer trained as a', - help='Input text') - parser.add_argument( - "-s", - "--ssl", - action="http://github.com/store_true", - required=False, - default=False, - help="Enable SSL encrypted channel to the server", - ) - parser.add_argument( - "-t", - "--stream-timeout", - type=float, - required=False, - default=None, - help="Stream timeout in seconds. Default is None.", - ) - parser.add_argument( - "-r", - "--root-certificates", - type=str, - required=False, - default=None, - help="File holding PEM-encoded root certificates. Default is None.", - ) - parser.add_argument( - "-p", - "--private-key", - type=str, - required=False, - default=None, - help="File holding PEM-encoded private key. Default is None.", - ) - parser.add_argument( - "-x", - "--certificate-chain", - type=str, - required=False, - default=None, - help="File holding PEM-encoded certificate chain. Default is None.", - ) - parser.add_argument( - "-C", - "--grpc-compression-algorithm", - type=str, - required=False, - default=None, - help= - "The compression algorithm to be used when sending request to server. Default is None.", - ) - parser.add_argument( - "-S", - "--streaming", - action="http://github.com/store_true", - required=False, - default=False, - help="Enable streaming mode. 
Default is False.", - ) - parser.add_argument( - "-c", - "--check-output", - action="http://github.com/store_true", - required=False, - default=False, - help="Enable check of output ids for CI", - ) - - parser.add_argument( - "-b", - "--beam-width", - required=False, - type=int, - default=1, - help="Beam width value", - ) - parser.add_argument( - "--temperature", - type=float, - required=False, - default=1.0, - help="temperature value", - ) - parser.add_argument( - "--request-output-len", - type=int, - required=False, - default=16, - help="temperature value", - ) - parser.add_argument( - '--stop-after-ms', - type=int, - required=False, - default=0, - help='Early stop the generation after a few milliseconds') - parser.add_argument('--tokenizer_dir', - type=str, - required=True, - help='Specify tokenizer directory') - parser.add_argument('--tokenizer_type', - type=str, - default='auto', - required=False, - choices=['auto', 't5', 'llama'], - help='Specify tokenizer type') - - FLAGS = parser.parse_args() - - print('=========') - if FLAGS.tokenizer_type == 't5': - tokenizer = T5Tokenizer(vocab_file=FLAGS.tokenizer_dir, - padding_side='left') - elif FLAGS.tokenizer_type == 'auto': - tokenizer = AutoTokenizer.from_pretrained(FLAGS.tokenizer_dir, - padding_side='left') - elif FLAGS.tokenizer_type == 'llama': - tokenizer = LlamaTokenizer.from_pretrained(FLAGS.tokenizer_dir, - legacy=False, - padding_side='left') - else: - raise AttributeError( - f'Unexpected tokenizer type: {FLAGS.tokenizer_type}') - tokenizer.pad_token = tokenizer.eos_token - pad_id = tokenizer.encode(tokenizer.pad_token, add_special_tokens=False)[0] - end_id = tokenizer.encode(tokenizer.eos_token, add_special_tokens=False)[0] - - input_ids = [tokenizer.encode(FLAGS.text)] - input_ids_data = np.array(input_ids, dtype=np.int32) - input_lengths = [[len(ii)] for ii in input_ids] - input_lengths_data = np.array(input_lengths, dtype=np.int32) - request_output_len = [[FLAGS.request_output_len]] - request_output_len_data = np.array(request_output_len, dtype=np.uint32) - beam_width = [[FLAGS.beam_width]] - beam_width_data = np.array(beam_width, dtype=np.uint32) - temperature = [[FLAGS.temperature]] - temperature_data = np.array(temperature, dtype=np.float32) - streaming = [[FLAGS.streaming]] - streaming_data = np.array(streaming, dtype=bool) - - inputs = prepare_inputs(input_ids_data, input_lengths_data, - request_output_len_data, beam_width_data, - temperature_data, streaming_data) - - if FLAGS.stop_after_ms > 0: - stop_inputs = prepare_stop_signals() - else: - stop_inputs = None - - request_id = "" - - expected_output_ids = [ - input_ids[0] + [ - 21221, 290, 257, 4255, 379, 262, 1957, 7072, 11, 4689, 347, 2852, - 2564, 494, 13, 679 - ] - ] - if FLAGS.streaming: - actual_output_ids = [input_ids[0]] - else: - actual_output_ids = [] - - user_data = UserData() - with grpcclient.InferenceServerClient( - url=FLAGS.url, - verbose=FLAGS.verbose, - ssl=FLAGS.ssl, - root_certificates=FLAGS.root_certificates, - private_key=FLAGS.private_key, - certificate_chain=FLAGS.certificate_chain, - ) as triton_client: - try: - - if FLAGS.streaming: - - # Establish stream - triton_client.start_stream( - callback=partial(callback, user_data), - stream_timeout=FLAGS.stream_timeout, - ) - # Send request - triton_client.async_stream_infer( - 'tensorrt_llm', - inputs, - request_id=request_id, - ) - - if stop_inputs is not None: - - time.sleep(FLAGS.stop_after_ms / 1000.0) - - triton_client.async_stream_infer( - 'tensorrt_llm', - stop_inputs, - 
request_id=request_id, - parameters={'Streaming': FLAGS.streaming}) - - #Wait for server to close the stream - triton_client.stop_stream() - - # Parse the responses - while True: - try: - result = user_data._completed_requests.get(block=False) - except Exception: - break - - if type(result) == InferenceServerException: - print("Received an error from server:") - print(result) - else: - output_ids = result.as_numpy('output_ids') - - if output_ids is not None: - if (FLAGS.streaming): - # Only one beam is supported - tokens = list(output_ids[0][0]) - actual_output_ids[ - 0] = actual_output_ids[0] + tokens - else: - for beam_output_ids in output_ids[0]: - tokens = list(beam_output_ids) - actual_output_ids.append(tokens) - else: - print("Got cancellation response from server") - else: - # Send request - triton_client.async_infer( - 'tensorrt_llm', - inputs, - request_id=request_id, - callback=partial(callback, user_data), - parameters={'Streaming': FLAGS.streaming}) - - if stop_inputs is not None: - - time.sleep(FLAGS.stop_after_ms / 1000.0) - - triton_client.async_infer( - 'tensorrt_llm', - stop_inputs, - request_id=request_id, - callback=partial(callback, user_data), - parameters={'Streaming': FLAGS.streaming}) - - processed_count = 0 - expected_responses = 1 + (1 if stop_inputs is not None else 0) - while processed_count < expected_responses: - try: - result = user_data._completed_requests.get() - print("Got completed request", flush=True) - except Exception: - break - - if type(result) == InferenceServerException: - print("Received an error from server:") - print(result) - else: - output_ids = result.as_numpy('output_ids') - if output_ids is not None: - for beam_output_ids in output_ids[0]: - tokens = list(beam_output_ids) - actual_output_ids.append(tokens) - else: - print("Got response for cancellation request") - - processed_count = processed_count + 1 - except Exception as e: - print("channel creation failed: " + str(e)) - sys.exit() - - passed = True - - print("output_ids = ", actual_output_ids) - output_ids = np.array(actual_output_ids) - output_ids = output_ids.reshape( - (output_ids.size, )).tolist()[input_ids_data.shape[1]:] - output_text = tokenizer.decode(output_ids) - print(f'Input: {FLAGS.text}') - print(f'Output: {output_text}') - if (FLAGS.check_output): - passed = (actual_output_ids == expected_output_ids) - print("expected_output_ids = ", expected_output_ids) - print("\n=====") - print("PASS!" if passed else "FAIL!") - print("=====") - - sys.exit(not passed) diff --git a/inflight_batcher_llm/cmake/TritonTensorRTLLMBackendConfig.cmake.in b/inflight_batcher_llm/cmake/TritonTensorRTLLMBackendConfig.cmake.in deleted file mode 100644 index 0db8a7f2..00000000 --- a/inflight_batcher_llm/cmake/TritonTensorRTLLMBackendConfig.cmake.in +++ /dev/null @@ -1,39 +0,0 @@ -# Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions -# are met: -# * Redistributions of source code must retain the above copyright -# notice, this list of conditions and the following disclaimer. -# * Redistributions in binary form must reproduce the above copyright -# notice, this list of conditions and the following disclaimer in the -# documentation and/or other materials provided with the distribution. 
-# * Neither the name of NVIDIA CORPORATION nor the names of its -# contributors may be used to endorse or promote products derived -# from this software without specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY -# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR -# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY -# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -include(CMakeFindDependencyMacro) - -get_filename_component( - TRITONTRTLLMBACKEND_CMAKE_DIR "${CMAKE_CURRENT_LIST_FILE}" PATH -) - -list(APPEND CMAKE_MODULE_PATH ${TRITONTRTLLMBACKEND_CMAKE_DIR }) - -if(NOT TARGET TritonTRTLLMBackend::triton-trtllm-backend) - include("${TRITONTRTLLMBACKEND_CMAKE_DIR }/TritonTRTLLMBackendTargets.cmake") -endif() - -set(TRITONTRTLLMBACKEND_LIBRARIES TritonTRTLLMBackend::triton-trtllm-backend) diff --git a/inflight_batcher_llm/cmake/modules/set_ifndef.cmake b/inflight_batcher_llm/cmake/modules/set_ifndef.cmake deleted file mode 100644 index bd8f0a3e..00000000 --- a/inflight_batcher_llm/cmake/modules/set_ifndef.cmake +++ /dev/null @@ -1,24 +0,0 @@ -# -# SPDX-FileCopyrightText: Copyright (c) 1993-2022 NVIDIA CORPORATION & -# AFFILIATES. All rights reserved. SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may not -# use this file except in compliance with the License. You may obtain a copy of -# the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -# License for the specific language governing permissions and limitations under -# the License. -# - -function(set_ifndef variable value) - if(NOT DEFINED ${variable}) - set(${variable} - ${value} - PARENT_SCOPE) - endif() -endfunction() diff --git a/inflight_batcher_llm/scripts/build.sh b/inflight_batcher_llm/scripts/build.sh deleted file mode 100644 index f7dc016c..00000000 --- a/inflight_batcher_llm/scripts/build.sh +++ /dev/null @@ -1,17 +0,0 @@ -#!/bin/bash - -TRT_ROOT=${1:-'/usr/local/tensorrt'} - -set -x -apt-get update -apt-get install -y --no-install-recommends rapidjson-dev - -BUILD_DIR=$(dirname $0)/../build -mkdir $BUILD_DIR -BUILD_DIR=$(cd -- "$BUILD_DIR" && pwd) -cd $BUILD_DIR - -cmake -DCMAKE_INSTALL_PREFIX:PATH=`pwd`/install \ - -DTRT_LIB_DIR=${TRT_ROOT}/targets/x86_64-linux-gnu/lib \ - -DTRT_INCLUDE_DIR=${TRT_ROOT}/include .. -make install diff --git a/inflight_batcher_llm/src/libtensorrtllm.cc b/inflight_batcher_llm/src/libtensorrtllm.cc deleted file mode 100644 index fcfd5b35..00000000 --- a/inflight_batcher_llm/src/libtensorrtllm.cc +++ /dev/null @@ -1,1277 +0,0 @@ -// Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
-// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions -// are met: -// * Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// * Redistributions in binary form must reproduce the above copyright -// notice, this list of conditions and the following disclaimer in the -// documentation and/or other materials provided with the distribution. -// * Neither the name of NVIDIA CORPORATION nor the names of its -// contributors may be used to endorse or promote products derived -// from this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY -// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR -// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY -// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -#define _GLIBCXX_USE_CXX11_ABI 0 -#include -#include -#include -#include -#include -#include -#include - -#include "triton/backend/backend_common.h" -#include "triton/backend/backend_input_collector.h" -#include "triton/backend/backend_model.h" -#include "triton/backend/backend_model_instance.h" -#include "triton/backend/backend_output_responder.h" -#include "triton/core/tritonbackend.h" -#include "triton/core/tritonserver.h" - -#include "tensorrt_llm/batch_manager/GptManager.h" -#include "tensorrt_llm/batch_manager/NamedTensor.h" -#include "tensorrt_llm/batch_manager/callbacks.h" -#include "tensorrt_llm/batch_manager/inferenceRequest.h" -#include "tensorrt_llm/common/logger.h" -#include "tensorrt_llm/plugins/api/tllmPlugin.h" -#include "tensorrt_llm/runtime/tllmLogger.h" - -#include - -#include "mpiUtils.h" - -using namespace ::triton::common; // TritonJson - -// -// Mockup of LLM inflight batcher based on triton 'minimal' backend example -// - -using namespace tensorrt_llm::batch_manager; -using namespace tensorrt_llm::runtime; -using namespace std::placeholders; // for _1, _2 etc. 
- -// template class inflight_batcher::batch_manager::GPTManager; - -namespace triton -{ -namespace backend -{ -namespace inflight_batcher_llm -{ - -inline static const std::string kStopInputTensorName = "stop"; -inline static const std::string kStreamingInputTensorName = "streaming"; - -bool getRequestBooleanInputTensor(TRITONBACKEND_Request* request, const std::string& inputTensorName) -{ - // Get stop signal from the request - TRITONBACKEND_Input* input; - TRITONSERVER_Error* error = TRITONBACKEND_RequestInput(request, inputTensorName.c_str(), &input); - if (error) - { - // If the user does not provide input "stop", then regard the request as - // unstopped - std::string msg - = "ModelInstanceState::getRequestBooleanInputTensor: user " - "did not not provide " - + inputTensorName + " input for the request"; - LOG_MESSAGE(TRITONSERVER_LOG_VERBOSE, msg.c_str()); - return false; - } - - uint64_t input_byte_size = 0; - uint32_t buffer_count = 0; - TRITONBACKEND_InputProperties(input, nullptr, nullptr, nullptr, nullptr, &input_byte_size, &buffer_count); - - LOG_MESSAGE(TRITONSERVER_LOG_VERBOSE, - ("ModelInstanceState::getRequestStopSignal: buffer_count = " + std::to_string(buffer_count)).c_str()); - - const void* buffer = 0L; - uint64_t buffer_byte_size = 0; - TRITONSERVER_MemoryType memory_type = TRITONSERVER_MEMORY_CPU; - int64_t memory_type_id = 0; - TRITONBACKEND_InputBuffer(input, 0, &buffer, &buffer_byte_size, &memory_type, &memory_type_id); - - assert((memory_type == TRITONSERVER_MEMORY_CPU) || (memory_type == TRITONSERVER_MEMORY_CPU_PINNED)); - - bool boolean = *reinterpret_cast(buffer); - - return boolean; -} - -nvinfer1::DataType to_trt_datatype(TRITONSERVER_DataType data_type) -{ - if (data_type == TRITONSERVER_TYPE_INVALID) - { - assert(false); - } - else if (data_type == TRITONSERVER_TYPE_BOOL) - { - return nvinfer1::DataType::kBOOL; - } - else if (data_type == TRITONSERVER_TYPE_UINT8) - { - return nvinfer1::DataType::kUINT8; - } - else if (data_type == TRITONSERVER_TYPE_UINT16) - { - assert(false); - } - else if (data_type == TRITONSERVER_TYPE_UINT32) - { - return nvinfer1::DataType::kINT32; - } - else if (data_type == TRITONSERVER_TYPE_UINT64) - { - return nvinfer1::DataType::kINT64; - } - else if (data_type == TRITONSERVER_TYPE_INT8) - { - return nvinfer1::DataType::kINT8; - } - else if (data_type == TRITONSERVER_TYPE_INT16) - { - assert(false); - } - else if (data_type == TRITONSERVER_TYPE_INT32) - { - return nvinfer1::DataType::kINT32; - } - else if (data_type == TRITONSERVER_TYPE_INT64) - { - return nvinfer1::DataType::kINT64; - } - else if (data_type == TRITONSERVER_TYPE_FP16) - { - return nvinfer1::DataType::kBF16; - } - else if (data_type == TRITONSERVER_TYPE_FP32) - { - return nvinfer1::DataType::kFLOAT; - } - else if (data_type == TRITONSERVER_TYPE_FP64) - { - assert(false); - } - else if (data_type == TRITONSERVER_TYPE_BYTES) - { - return nvinfer1::DataType::kINT8; - } - else if (data_type == TRITONSERVER_TYPE_BF16) - { - return nvinfer1::DataType::kBF16; - } - else - { - assert(false); - } - return nvinfer1::DataType(0); -} - -TRITONSERVER_DataType to_triton_datatype(nvinfer1::DataType data_type) -{ - if (data_type == nvinfer1::DataType::kBOOL) - { - return TRITONSERVER_TYPE_BOOL; - } - else if (data_type == nvinfer1::DataType::kUINT8) - { - return TRITONSERVER_TYPE_UINT8; - } - else if (data_type == nvinfer1::DataType::kHALF) - { - return TRITONSERVER_TYPE_BF16; - } - else if (data_type == nvinfer1::DataType::kINT8) - { - return TRITONSERVER_TYPE_INT8; - } - else 
if (data_type == nvinfer1::DataType::kINT32) - { - return TRITONSERVER_TYPE_INT32; - } - else if (data_type == nvinfer1::DataType::kINT64) - { - return TRITONSERVER_TYPE_INT64; - } - else if (data_type == nvinfer1::DataType::kFLOAT) - { - return TRITONSERVER_TYPE_FP32; - } - else if (data_type == nvinfer1::DataType::kBF16) - { - return TRITONSERVER_TYPE_BF16; - } - else - { - return TRITONSERVER_TYPE_INVALID; - } -} - -///////////// - -// -// ModelState -// -// State associated with a model that is using this backend. An object -// of this class is created and associated with each -// TRITONBACKEND_Model. -// -class ModelState -{ -public: - static TRITONSERVER_Error* Create(TRITONBACKEND_Model* triton_model, ModelState** state); - - template - T GetParameter(const std::string& name) - { - assert(false); - } - - virtual ~ModelState() = default; - - common::TritonJson::Value& GetModelConfig(); - -private: - common::TritonJson::Value model_config_; - std::shared_ptr mTrtLogger{}; - - ModelState(TRITONBACKEND_Model* triton_model, TritonJson::Value&& model_config) - : model_config_(std::move(model_config)) - { - mTrtLogger = std::make_shared(); - initTrtLlmPlugins(mTrtLogger.get()); - } -}; - -TRITONSERVER_Error* ModelState::Create(TRITONBACKEND_Model* triton_model, ModelState** state) -{ - TRITONSERVER_Message* config_message; - RETURN_IF_ERROR(TRITONBACKEND_ModelConfig(triton_model, 1 /* config_version */, &config_message)); - - // We can get the model configuration as a json string from - // config_message, parse it with our favorite json parser to create - // DOM that we can access when we need to example the - // configuration. We use TritonJson, which is a wrapper that returns - // nice errors (currently the underlying implementation is - // rapidjson... but others could be added). You can use any json - // parser you prefer. 
- const char* buffer; - size_t byte_size; - RETURN_IF_ERROR(TRITONSERVER_MessageSerializeToJson(config_message, &buffer, &byte_size)); - - common::TritonJson::Value model_config; - TRITONSERVER_Error* err = model_config.Parse(buffer, byte_size); - RETURN_IF_ERROR(TRITONSERVER_MessageDelete(config_message)); - RETURN_IF_ERROR(err); - - try - { - *state = new ModelState(triton_model, std::move(model_config)); - } - catch (const std::exception& ex) - { - std::string errStr = std::string("unexpected error when creating modelState: ") + ex.what(); - return TRITONSERVER_ErrorNew(TRITONSERVER_ERROR_INTERNAL, errStr.c_str()); - } - - return nullptr; // success -} - -common::TritonJson::Value& ModelState::GetModelConfig() -{ - return model_config_; -} - -template <> -std::string ModelState::GetParameter(const std::string& name) -{ - TritonJson::Value parameters; - TRITONSERVER_Error* err = model_config_.MemberAsObject("parameters", ¶meters); - if (err != nullptr) - { - throw std::runtime_error("Model config doesn't have a parameters section"); - TRITONSERVER_ErrorDelete(err); - } - TritonJson::Value value; - std::string str_value; - err = parameters.MemberAsObject(name.c_str(), &value); - if (err != nullptr) - { - std::string errStr = "Cannot find parameter with name: " + name; - throw std::runtime_error(errStr); - TRITONSERVER_ErrorDelete(err); - } - value.MemberAsString("string_value", &str_value); - return str_value; -} - -template <> -int32_t ModelState::GetParameter(const std::string& name) -{ - return std::stoi(GetParameter(name)); -} - -template <> -uint32_t ModelState::GetParameter(const std::string& name) -{ - return (uint32_t) std::stoul(GetParameter(name)); -} - -template <> -int64_t ModelState::GetParameter(const std::string& name) -{ - return std::stoll(GetParameter(name)); -} - -template <> -uint64_t ModelState::GetParameter(const std::string& name) -{ - return std::stoull(GetParameter(name)); -} - -template <> -float ModelState::GetParameter(const std::string& name) -{ - return std::stof(GetParameter(name)); -} - -template <> -bool ModelState::GetParameter(const std::string& name) -{ - auto val = GetParameter(name); - if (val == "True" || val == "true" || val == "TRUE" || val == "1") - { - return true; - } - else if (val == "False" || val == "false" || val == "FALSE" || val == "0") - { - return false; - } - else - { - std::string err = "Cannot convert " + val + " to a boolean."; - throw std::runtime_error(err); - } -} - -extern "C" -{ - - // Triton calls TRITONBACKEND_ModelInitialize when a model is loaded - // to allow the backend to create any state associated with the model, - // and to also examine the model configuration to determine if the - // configuration is suitable for the backend. Any errors reported by - // this function will prevent the model from loading. - // - TRITONSERVER_Error* TRITONBACKEND_ModelInitialize(TRITONBACKEND_Model* model) - { - // Create a ModelState object and associate it with the - // TRITONBACKEND_Model. If anything goes wrong with initialization - // of the model state then an error is returned and Triton will fail - // to load the model. - ModelState* model_state; - RETURN_IF_ERROR(ModelState::Create(model, &model_state)); - RETURN_IF_ERROR(TRITONBACKEND_ModelSetState(model, reinterpret_cast(model_state))); - - return nullptr; // success - } - - // Triton calls TRITONBACKEND_ModelFinalize when a model is no longer - // needed. The backend should cleanup any state associated with the - // model. 
This function will not be called until all model instances - // of the model have been finalized. - // - TRITONSERVER_Error* TRITONBACKEND_ModelFinalize(TRITONBACKEND_Model* model) - { - void* vstate; - RETURN_IF_ERROR(TRITONBACKEND_ModelState(model, &vstate)); - ModelState* model_state = reinterpret_cast(vstate); - delete model_state; - - return nullptr; // success - } - -} // extern "C" - -///////////// - -// Class holding all infos regarding a single work item. -// This includes the original request, associated response factor -// and state. -class WorkItem -{ -public: - WorkItem(TRITONBACKEND_Request* request, bool isDecoupled) - { - mRequestId = (rand() % INT64_MAX) + 1; - mInferenceRequest = createInferenceRequest(request, mRequestId, isDecoupled); - - // Create response factory for this request - TRITONBACKEND_ResponseFactoryNew(&factory_ptr_, request); - } - - WorkItem(TRITONBACKEND_Request* request, uint64_t request_id, bool isDecoupled) - : mRequestId(request_id) - { - mInferenceRequest = createInferenceRequest(request, mRequestId, isDecoupled); - - // Create response factory for this request - TRITONBACKEND_ResponseFactoryNew(&factory_ptr_, request); - } - - WorkItem(std::shared_ptr ir, uint64_t RequestId) - : mInferenceRequest(ir) - , mRequestId(RequestId) - { - factory_ptr_ = nullptr; - } - - ~WorkItem() - { - if (factory_ptr_ != nullptr) - { - TRITONBACKEND_ResponseFactoryDelete(factory_ptr_); - } - } - - TRITONBACKEND_ResponseFactory* response_factory() - { - assert(factory_ptr_ != nullptr); - return factory_ptr_; - } - - uint64_t requestId() const - { - return mRequestId; - } - - std::shared_ptr getInferenceRequest() const - { - return mInferenceRequest; - } - -private: - // Convert info from original backend request to data structures defined in - // common/common.h - std::shared_ptr createInferenceRequest( - TRITONBACKEND_Request* request, uint64_t requestId, bool isDecoupled) - { - auto inferenceRequest = std::make_shared(requestId); - - // Extract input tensors - std::map input_tensors; - uint32_t num_inputs; - LOG_IF_ERROR(TRITONBACKEND_RequestInputCount(request, &num_inputs), "Error getting input count"); - for (uint32_t idx = 0; idx < num_inputs; ++idx) - { - TRITONBACKEND_Input* input = 0L; - TRITONBACKEND_RequestInputByIndex(request, idx, &input); - - const char* input_name = 0L; - TRITONSERVER_DataType data_type = TRITONSERVER_TYPE_INVALID; - const int64_t* shape = 0L; - uint32_t dims_count = 0; - uint64_t byte_size = 0; - uint32_t buffer_count = 0; - TRITONBACKEND_InputProperties( - input, &input_name, &data_type, &shape, &dims_count, &byte_size, &buffer_count); - - if (std::string(input_name) == "START" || std::string(input_name) == "CORRID" - || std::string(input_name) == "END" || std::string(input_name) == kStopInputTensorName - || std::string(input_name) == kStreamingInputTensorName) - { - continue; - } - - std::vector shapev; - for (uint32_t i = 0; i < dims_count; ++i) - { - shapev.push_back(shape[i]); - } - - NamedTensor t(to_trt_datatype(data_type), shapev, input_name); - uint64_t buffer_offset = 0; - for (int64_t buffer_id = 0; buffer_id < buffer_count; ++buffer_id) - { - const void* buffer = 0L; - uint64_t buffer_byte_size = 0; - TRITONSERVER_MemoryType memory_type = TRITONSERVER_MEMORY_CPU; - int64_t memory_type_id = 0; - TRITONBACKEND_InputBuffer(input, buffer_id, &buffer, &buffer_byte_size, &memory_type, &memory_type_id); - assert((memory_type == TRITONSERVER_MEMORY_CPU) || (memory_type == TRITONSERVER_MEMORY_CPU_PINNED)); - // TODO: Do we need to 
handle GPU mem input buffers?? - std::memcpy(static_cast(t.tensor->data()) + buffer_offset, buffer, buffer_byte_size); - buffer_offset += buffer_byte_size; - } - - inferenceRequest->emplaceInputTensor(t.name, std::move(t.tensor)); - } - - bool streamingFlag = getRequestBooleanInputTensor(request, kStreamingInputTensorName); - inferenceRequest->setIsStreaming(streamingFlag); - - if (streamingFlag && !isDecoupled) - { - throw std::runtime_error( - "Streaming is only supported if model is " - "deployed using decoupled mode."); - } - - return inferenceRequest; - } - - std::shared_ptr mInferenceRequest; - TRITONBACKEND_ResponseFactory* factory_ptr_; - uint64_t mRequestId; -}; - -/// @brief Thread-safe queue of work items - -class WorkItemsQueue -{ -public: - void clear() - { - std::lock_guard lk(mMutex); - mPendingWorkItems.clear(); - mPendingWorkItemsReqIds.clear(); - mInProgressWorkItems.clear(); - mStoppedReqIds.clear(); - } - - // Note: this function only be called under a lock - bool hasInProgressReqId(const uint64_t reqId) const - { - return (mInProgressWorkItems.find(reqId) != mInProgressWorkItems.end()); - } - - // Note: this function only be called under a lock - bool hasPendingReqId(const uint64_t reqId) const - { - return (mPendingWorkItemsReqIds.find(reqId) != mPendingWorkItemsReqIds.end()); - } - - /// @brief Add a new work item to the queue - /// Throws an error if requestId already exists - - void push(TRITONBACKEND_Request* request, uint64_t requestId, bool isDecoupled) - { - std::lock_guard lk(mMutex); - if (hasInProgressReqId(requestId) || hasPendingReqId(requestId)) - { - std::string errStr - = "requestId " + std::to_string(requestId) + " is already in progress, request is ignored."; - throw std::runtime_error(errStr); - } - else - { - auto workItem = std::make_shared(request, requestId, isDecoupled); - mPendingWorkItems.push_back(workItem); - mPendingWorkItemsReqIds.insert(workItem->requestId()); - } - } - - void push(TRITONBACKEND_Request* request, bool isDecoupled) - { - std::lock_guard lk(mMutex); - auto workItem = std::make_shared(request, isDecoupled); - mPendingWorkItems.push_back(workItem); - mPendingWorkItemsReqIds.insert(workItem->requestId()); - } - - /// @brief Get a new work item from the queue, and move it to the list of - /// in progress work items if it hasn't been stopped - /// @return A tuple of the workItem and a boolean flag indicating if the work - /// item has been marked in progress - std::tuple, bool> pop() - { - std::lock_guard lk(mMutex); - - auto workItem = mPendingWorkItems.front(); - mPendingWorkItems.pop_front(); - mPendingWorkItemsReqIds.erase(workItem->requestId()); - - bool markedInProgress; - // Check if work item has been stopped - if (mStoppedReqIds.find(workItem->requestId()) == mStoppedReqIds.end()) - { - mInProgressWorkItems.emplace(std::make_pair(workItem->requestId(), workItem)); - markedInProgress = true; - } - else - { - mStoppedReqIds.erase(workItem->requestId()); - markedInProgress = false; - } - - return {workItem, markedInProgress}; - } - - size_t numPendingWorkItems() const - { - std::lock_guard lk(mMutex); - return mPendingWorkItems.size(); - } - - std::shared_ptr getInProgressWorkItem(uint64_t requestId) - { - std::lock_guard lk(mMutex); - return mInProgressWorkItems.at(requestId); - } - - /// @brief Mark a request as being finished - /// @param requestId - void markFinished(const uint64_t requestId) - { - std::lock_guard lk(mMutex); - if (hasInProgressReqId(requestId)) - { - mInProgressWorkItems.erase(requestId); - } - - if 
(mStoppedReqIds.find(requestId) != mStoppedReqIds.end()) - { - mStoppedReqIds.erase(requestId); - } - } - - // Stop a request by adding the request Id to a set - // The set of stopped request id is used by the poll callback - // and the pop function - void stopWorkItem(const uint64_t requestId) - { - std::lock_guard lk(mMutex); - TLLM_LOG_DEBUG("Stopping request"); - if (hasInProgressReqId(requestId) || hasPendingReqId(requestId)) - { - mStoppedReqIds.emplace(requestId); - } - else - { - std::string errStr = std::string("Received stop request for requestId ") + std::to_string(requestId) - + std::string(" but it's not active (might be completed already)."); - throw std::runtime_error(errStr); - } - } - - std::unordered_set getStoppedReqIds() const - { - std::lock_guard lk(mMutex); - return mStoppedReqIds; - } - -private: - /// Queue of work items - std::list> mPendingWorkItems; - /// requestIds of work items in the queue - std::set mPendingWorkItemsReqIds; - - /// work items currently in progress - std::unordered_map> mInProgressWorkItems; - - /// ids of the work items that have been stopped - std::unordered_set mStoppedReqIds; - - mutable std::mutex mMutex; -}; - -// -// ModelInstanceState -// -// State associated with a model instance. An object of this class is -// created and associated with each -// TRITONBACKEND_ModelInstance. ModelInstanceState is derived from -// -class ModelInstanceState -{ -public: - static TRITONSERVER_Error* Create( - ModelState* model_state, TRITONBACKEND_ModelInstance* triton_model_instance, ModelInstanceState** state); - - virtual ~ModelInstanceState() - { - // terminate decoupled execution loop - { - mWorkItemsQueue.clear(); - } - } - - // Get the state of the model that corresponds to this instance. - ModelState* StateForModel() const - { - return model_state_; - } - - bool isDecoupled() const - { - return mIsDecoupled; - } - - uint64_t getRequestId(TRITONBACKEND_Request* request) - { - const char* charRequestId; - TRITONBACKEND_RequestId(request, &charRequestId); - uint64_t requestId = 0; - if (charRequestId != nullptr) - { - std::string strRequestId(charRequestId); - if (!strRequestId.empty()) - { - try - { - requestId = stoul(strRequestId); - } - catch (const std::exception& e) - { - std::string err = std::string("Invalid requestId, must be uint64_t. 
Got ") + strRequestId; - throw std::runtime_error(err); - } - } - } - - return requestId; - } - - // For stop requests, or in case of error during enqueue, we need to send a - // response to the client - void sendEnqueueResponse(TRITONBACKEND_Request* request, const std::string& errMsg = "") - { - TRITONBACKEND_ResponseFactory* factory_ptr; - // Create response factory for this request - LOG_IF_ERROR(TRITONBACKEND_ResponseFactoryNew(&factory_ptr, request), "Cannot create response factory"); - - TRITONSERVER_Error* err = nullptr; - if (!errMsg.empty()) - { - TLLM_LOG_ERROR(errMsg); - err = TRITONSERVER_ErrorNew(TRITONSERVER_ERROR_INTERNAL, errMsg.c_str()); - } - TRITONBACKEND_Response* response; - LOG_IF_ERROR(TRITONBACKEND_ResponseNewFromFactory(&response, factory_ptr), "Cannot create response"); - LOG_IF_ERROR( - TRITONBACKEND_ResponseSend(response, TRITONSERVER_RESPONSE_COMPLETE_FINAL, err), "Cannot send response"); - LOG_IF_ERROR(TRITONBACKEND_ResponseFactoryDelete(factory_ptr), "Cannot delete response factory"); - } - - void enqueue(TRITONBACKEND_Request** requests, const uint32_t request_count, bool isDecoupled) - { - for (uint32_t r = 0; r < request_count; ++r) - { - - TRITONBACKEND_Request* request = requests[r]; - try - { - auto requestId = getRequestId(request); - bool stopRequest = getRequestBooleanInputTensor(request, kStopInputTensorName); - - if (requestId != 0) - { - if (stopRequest) - { - // Check if request is in progress or in queue, if not ignore - mWorkItemsQueue.stopWorkItem(requestId); - // Send a response back to client for stop request - sendEnqueueResponse(request); - } - else - { - mWorkItemsQueue.push(request, requestId, isDecoupled); - } - } - else if (!stopRequest) - { - mWorkItemsQueue.push(request, isDecoupled); - } - else - { - throw std::runtime_error("Cannot send stop request without specifying a request_id"); - } - } - catch (const std::exception& e) - { - // In case of error, no work item is added to queue, so response - // callback needs to be called - sendEnqueueResponse(request, e.what()); - } - } - return; - } - - // Return up to max_num_requests inference requests. - std::list> get_inference_requests(const int max_num_requests) - { - std::list> rval; - if (max_num_requests > 0) - { - auto world_size = getCommWorldSize(); - auto rank = getCommWorldRank(); - if (rank == 0) - { - int64_t num_new_work_items = std::min(static_cast(mWorkItemsQueue.numPendingWorkItems()), - static_cast(max_num_requests)); - if (world_size > 1) - { - bcast(&num_new_work_items, 1, MPI_TYPE_INT64_T, 0); - } - - if (num_new_work_items > 0) - { - int count = 0; - while (count < num_new_work_items) - { - auto [workItem, markedInProgress] = mWorkItemsQueue.pop(); - - if (markedInProgress) - { - rval.emplace_back(workItem->getInferenceRequest()); - count++; - } - else - { - std::string warnStr = std::string("request Id ") + std::to_string(workItem->requestId()) - + std::string(" has been stopped. 
Request is ignored."); - TLLM_LOG_WARNING(warnStr); - sendTritonResponse(workItem, {}, true, warnStr); - } - } - if (world_size > 1) - { - std::vector packed; - for (auto ir : rval) - { - auto vpacked = ir->serialize(); - packed.push_back(static_cast(vpacked.size())); - packed.insert( - packed.end(), std::move_iterator(vpacked.begin()), std::move_iterator(vpacked.end())); - } - int64_t nWords1 = static_cast(packed.size()); - bcast(&nWords1, 1, MPI_TYPE_INT64_T, 0); - bcast(packed, 0); - } - } - } - else - { - // subordinate ranks hang until master rank sends work - int64_t num_new_work_items; - bcast(&num_new_work_items, 1, MPI_TYPE_INT64_T, 0); - if (num_new_work_items > 0) - { - int nWords1; - bcast(&nWords1, 1, MPI_TYPE_INT64_T, 0); - std::vector packed(nWords1); - bcast(packed, 0); - int64_t* packed_ptr = packed.data(); - for (int64_t count = 0; count < num_new_work_items; ++count) - { - int64_t n = *(packed_ptr++); - auto ir = InferenceRequest::deserialize(packed_ptr); - packed_ptr += n; - rval.emplace_back(ir); - } - } - } - } - return rval; - } - - TRITONSERVER_Error* sendTritonResponse(std::shared_ptr workItem, - std::list const& response_tensors, bool final_response, const std::string& errMsg) - { - TRITONBACKEND_ResponseFactory* response_factory; - response_factory = workItem->response_factory(); - - TRITONBACKEND_Response* response; - RETURN_IF_ERROR(TRITONBACKEND_ResponseNewFromFactory(&response, response_factory)); - - auto requestId = workItem->requestId(); - if (final_response) - { - mWorkItemsQueue.markFinished(requestId); - } - - // Check if error - TRITONSERVER_Error* err = nullptr; - if (!errMsg.empty()) - { - std::string errStr = "Encountered error for requestId " + std::to_string(requestId) + ": " + errMsg; - TLLM_LOG_ERROR(errStr); - - err = TRITONSERVER_ErrorNew(TRITONSERVER_ERROR_INTERNAL, errStr.c_str()); - final_response = true; - } - else - { - for (auto it = response_tensors.begin(); it != response_tensors.end(); ++it) - { - auto tensor = *it; - auto shape = tensor.tensor->getShape(); // returns std::vectorint64_t> - std::vector vshape(shape.nbDims); - for (int i = 0; i < vshape.size(); ++i) - { - vshape[i] = shape.d[i]; - } - - TRITONBACKEND_Output* output; - RETURN_IF_ERROR(TRITONBACKEND_ResponseOutput(response, &output, tensor.name.c_str(), - to_triton_datatype(tensor.tensor->getDataType()), vshape.data(), shape.nbDims)); - - uint64_t buffersize = tensor.tensor->getSizeInBytes(); - void* buffer = 0L; - TRITONSERVER_MemoryType memory_type = TRITONSERVER_MEMORY_CPU; - int64_t memory_type_id = 0; - RETURN_IF_ERROR(TRITONBACKEND_OutputBuffer(output, &buffer, buffersize, &memory_type, &memory_type_id)); - if (memory_type != TRITONSERVER_MEMORY_CPU && memory_type != TRITONSERVER_MEMORY_CPU_PINNED) - { - std::string errStr = "Triton failed to allocate output buffer on CPU"; - err = TRITONSERVER_ErrorNew(TRITONSERVER_ERROR_INTERNAL, errStr.c_str()); - break; - } - std::memcpy(buffer, tensor.tensor->data(), buffersize); - } - } - - RETURN_IF_ERROR( - TRITONBACKEND_ResponseSend(response, final_response ? 
TRITONSERVER_RESPONSE_COMPLETE_FINAL : 0, err)); - - return nullptr; - } - - void sendResponse(uint64_t requestId, std::list const& response_tensors, bool final_response, - const std::string& errMsg) - { - if (getCommWorldRank() == 0) - { - std::string errStr - = std::string("Failed to send Triton response for requestId: ") + std::to_string(requestId); - try - { - auto workItem = mWorkItemsQueue.getInProgressWorkItem(requestId); - auto tritonErr = sendTritonResponse(workItem, response_tensors, final_response, errMsg); - LOG_IF_ERROR(tritonErr, errStr); - } - catch (const std::exception& e) - { - TLLM_LOG_ERROR(errStr); - } - } - } - - std::unordered_set pollStopSignals() - { - auto stoppedReqIds = mWorkItemsQueue.getStoppedReqIds(); - int64_t nStoppedReqIds = static_cast(stoppedReqIds.size()); - - if (getCommWorldSize() > 1) - { - // Broadcast number of stopped requests - bcast(&nStoppedReqIds, 1, MPI_TYPE_INT64_T, 0); - - if (nStoppedReqIds > 0) - { - // Broadcast stopped requests Ids - if (getCommWorldRank() == 0) - { - // Store the requestIds in a contiguous vector - std::vector stoppedReqIdsVec(stoppedReqIds.begin(), stoppedReqIds.end()); - bcast(stoppedReqIdsVec.data(), stoppedReqIdsVec.size(), MPI_TYPE_UINT64_T, 0); - } - else - { - std::vector stoppedReqIdsVec(nStoppedReqIds); - bcast(stoppedReqIdsVec.data(), stoppedReqIdsVec.size(), MPI_TYPE_UINT64_T, 0); - // Store the requestIds in the set - stoppedReqIds.clear(); - std::copy(stoppedReqIdsVec.begin(), stoppedReqIdsVec.end(), - std::inserter(stoppedReqIds, stoppedReqIds.end())); - } - } - } - return stoppedReqIds; - } - - void logStats(const std::string& s) - { - LOG_MESSAGE(TRITONSERVER_LOG_VERBOSE, s.c_str()); - } - -private: - ModelInstanceState(ModelState* model_state, TRITONBACKEND_ModelInstance* triton_model_instance) - : model_state_(model_state) - , mIsDecoupled(false) - { - // Note: std::string::compare fails this test (always return non-zero - // value). Using old school strcmp instead. - if (model_state_->GetParameter("gpt_model_type") == "V1" - || model_state_->GetParameter("gpt_model_type") == "v1") - { - mTrtGptModelType = TrtGptModelType::V1; - } - else if (model_state_->GetParameter("gpt_model_type") == "inflight_batching") - { - mTrtGptModelType = TrtGptModelType::InflightBatching; - } - else if (model_state_->GetParameter("gpt_model_type") == "inflight_fused_batching") - { - mTrtGptModelType = TrtGptModelType::InflightFusedBatching; - } - else - { - throw std::runtime_error( - "Invalid gpt_model_type. Must be " - "v1/inflight_batching/inflight_fused_batching."); - } - - // Check if model is in decoupled mode: - triton::common::TritonJson::Value transaction_policy; - model_state_->GetModelConfig().MemberAsObject("model_transaction_policy", &transaction_policy); - transaction_policy.MemberAsBool("decoupled", &mIsDecoupled); - - // Note: std::string::compare fails this test (always return non-zero - // value). Using old school strcmp instead. 
- mModelPath = model_state_->GetParameter("gpt_model_path"); - auto configPath = mModelPath + "/config.json"; - std::ifstream jsonStream(configPath); - - auto constexpr allowExceptions = true; - auto constexpr ingoreComments = true; - auto json = nlohmann::json::parse(jsonStream, nullptr, allowExceptions, ingoreComments); - - int32_t maxBeamWidth = 1; - try - { - maxBeamWidth = model_state_->GetParameter("max_beam_width"); - } - catch (const std::exception& e) - { - // If parameter is not specified, just ignore - TLLM_LOG_WARNING("max_beam_width is not specified, will use default value of 1"); - } - - std::optional maxTokensInPagedKvCache = std::nullopt; - try - { - maxTokensInPagedKvCache = model_state_->GetParameter("max_tokens_in_paged_kv_cache"); - } - catch (const std::exception& e) - { - // If parameter is not specified, just ignore - TLLM_LOG_WARNING( - "max_tokens_in_paged_kv_cache is not specified, will " - "use default value"); - } - - auto schedulerPolicy = batch_scheduler::SchedulerPolicy::GUARANTEED_NO_EVICT; - try - { - std::string schedulerPolicyStr = model_state_->GetParameter("batch_scheduler_policy"); - if (schedulerPolicyStr == "max_utilization") - { - schedulerPolicy = batch_scheduler::SchedulerPolicy::MAX_UTILIZATION; - } - else if (schedulerPolicyStr == "guaranteed_no_evict") - { - schedulerPolicy = batch_scheduler::SchedulerPolicy::GUARANTEED_NO_EVICT; - } - else - { - throw std::runtime_error( - "batch_scheduler_policy parameter was not found or is invalid " - "(must be max_utilization or guaranteed_no_evict)"); - } - } - catch (const std::exception& e) - { - TLLM_LOG_WARNING(e.what()); - } - - if (mIsDecoupled && schedulerPolicy != batch_scheduler::SchedulerPolicy::GUARANTEED_NO_EVICT) - { - TLLM_LOG_WARNING( - "The batch scheduler policy will be set to guaranteed_no_evict" - "since the backend operates in decoupled mode"); - schedulerPolicy = batch_scheduler::SchedulerPolicy::GUARANTEED_NO_EVICT; - } - - std::optional kvCacheFreeGpuMemFraction = std::nullopt; - try - { - kvCacheFreeGpuMemFraction = model_state_->GetParameter("kv_cache_free_gpu_mem_fraction"); - } - catch (const std::exception& e) - { - // If parameter is not specified, just ignore - TLLM_LOG_WARNING( - "kv_cache_free_gpu_mem_fraction is not specified, will use default value of 0.85 or " - "max_tokens_in_paged_kv_cache"); - } - - std::optional maxNumSequences = std::nullopt; - try - { - maxNumSequences = model_state_->GetParameter("max_num_sequences"); - } - catch (const std::exception& e) - { - // If parameter is not specified, just ignore - TLLM_LOG_WARNING("max_num_sequences is not specified, will be set to the TRT engine max_batch_size"); - } - - std::optional enableTrtOverlap = std::nullopt; - try - { - enableTrtOverlap = model_state_->GetParameter("enable_trt_overlap"); - } - catch (const std::exception& e) - { - // If parameter is not specified, just ignore - TLLM_LOG_WARNING("enable_trt_overlap is not specified, will be set to true"); - } - - TrtGptModelOptionalParams optionalParams( - maxNumSequences, maxTokensInPagedKvCache, kvCacheFreeGpuMemFraction, enableTrtOverlap); - - mBatchManager = std::make_shared( - mModelPath, mTrtGptModelType, maxBeamWidth, schedulerPolicy, - [this](int max_num_requests) { return get_inference_requests(max_num_requests); }, - [this](uint64_t requestId, std::list response_tensors, bool final_response, - const std::string& errMsg) - { return sendResponse(requestId, response_tensors, final_response, errMsg); }, - [this]() { return pollStopSignals(); }, 
[this](const std::string& s) { return logStats(s); }, - optionalParams); - - if (getCommWorldRank() != 0) - { - while (true) - { - } - } - } - - ModelState* model_state_; - - // - // inflight batcher is a decoupled design. - // It uses response factory objects to decouple responses from requests. - // - // New requests are added to mWorkItems list. This list is processed - // in an infinite loop run by a worker thread. Requests take multiple - // iterations to complete, and number of iterations is not known in - // advance. To facilitate this, we use response factory objects to - // decouple requests and responses. - // - TrtGptModelType mTrtGptModelType; - std::string mModelPath; - bool mIsDecoupled; - - std::shared_ptr mBatchManager; - - WorkItemsQueue mWorkItemsQueue; -}; - -TRITONSERVER_Error* ModelInstanceState::Create( - ModelState* model_state, TRITONBACKEND_ModelInstance* triton_model_instance, ModelInstanceState** state) -{ - try - { - *state = new ModelInstanceState(model_state, triton_model_instance); - } - catch (const std::exception& ex) - { - std::string errStr = std::string("unexpected error when creating modelInstanceState: ") + ex.what(); - return TRITONSERVER_ErrorNew(TRITONSERVER_ERROR_INTERNAL, errStr.c_str()); - } - - return nullptr; // success -} - -extern "C" -{ - - // Triton calls TRITONBACKEND_ModelInstanceInitialize when a model - // instance is created to allow the backend to initialize any state - // associated with the instance. - // - TRITONSERVER_Error* TRITONBACKEND_ModelInstanceInitialize(TRITONBACKEND_ModelInstance* instance) - { - // Get the model state associated with this instance's model. - TRITONBACKEND_Model* model; - RETURN_IF_ERROR(TRITONBACKEND_ModelInstanceModel(instance, &model)); - - void* vmodelstate; - RETURN_IF_ERROR(TRITONBACKEND_ModelState(model, &vmodelstate)); - ModelState* model_state = reinterpret_cast(vmodelstate); - - // Create a ModelInstanceState object and associate it with the - // TRITONBACKEND_ModelInstance. - ModelInstanceState* instance_state; - RETURN_IF_ERROR(ModelInstanceState::Create(model_state, instance, &instance_state)); - RETURN_IF_ERROR(TRITONBACKEND_ModelInstanceSetState(instance, reinterpret_cast(instance_state))); - - return nullptr; // success - } - - // Triton calls TRITONBACKEND_ModelInstanceFinalize when a model - // instance is no longer needed. The backend should cleanup any state - // associated with the model instance. - // - TRITONSERVER_Error* TRITONBACKEND_ModelInstanceFinalize(TRITONBACKEND_ModelInstance* instance) - { - void* vstate; - RETURN_IF_ERROR(TRITONBACKEND_ModelInstanceState(instance, &vstate)); - ModelInstanceState* instance_state = reinterpret_cast(vstate); - delete instance_state; - - return nullptr; // success - } - -} // extern "C" - -///////////// - -extern "C" -{ - - // When Triton calls TRITONBACKEND_ModelInstanceExecute it is required - // that a backend create a response for each request in the batch. A - // response may be the output tensors required for that request or may - // be an error that is returned in the response. 
- // - TRITONSERVER_Error* TRITONBACKEND_ModelInstanceExecute( - TRITONBACKEND_ModelInstance* instance, TRITONBACKEND_Request** requests, const uint32_t request_count) - { - ModelInstanceState* instance_state; - RETURN_IF_ERROR(TRITONBACKEND_ModelInstanceState(instance, reinterpret_cast(&instance_state))); - - auto isDecoupled = instance_state->isDecoupled(); - - instance_state->enqueue(requests, request_count, isDecoupled); - - for (uint32_t r = 0; r < request_count; ++r) - { - TRITONBACKEND_Request* request = requests[r]; - TRITONBACKEND_RequestRelease(request, TRITONSERVER_REQUEST_RELEASE_ALL); - } - - return nullptr; // success - } - -} // extern "C" - -} // namespace inflight_batcher_llm -} // namespace backend -} // namespace triton diff --git a/inflight_batcher_llm/src/libtriton_tensorrtllm.ldscript b/inflight_batcher_llm/src/libtriton_tensorrtllm.ldscript deleted file mode 100644 index 748714d1..00000000 --- a/inflight_batcher_llm/src/libtriton_tensorrtllm.ldscript +++ /dev/null @@ -1,30 +0,0 @@ -# Copyright 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions -# are met: -# * Redistributions of source code must retain the above copyright -# notice, this list of conditions and the following disclaimer. -# * Redistributions in binary form must reproduce the above copyright -# notice, this list of conditions and the following disclaimer in the -# documentation and/or other materials provided with the distribution. -# * Neither the name of NVIDIA CORPORATION nor the names of its -# contributors may be used to endorse or promote products derived -# from this software without specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY -# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR -# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY -# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -{ - global: - TRITONBACKEND_*; - local: *; -}; diff --git a/inflight_batcher_llm/src/mpiUtils.h b/inflight_batcher_llm/src/mpiUtils.h deleted file mode 100644 index 41dc0a6a..00000000 --- a/inflight_batcher_llm/src/mpiUtils.h +++ /dev/null @@ -1,84 +0,0 @@ -/* - * Copyright (c) 2022-2023, NVIDIA CORPORATION. All rights reserved. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#pragma once - -#include - -#define MPICHECK(cmd) \ - do \ - { \ - int e = cmd; \ - if (e != MPI_SUCCESS) \ - { \ - printf("Failed: MPI error %s:%d '%d'\n", __FILE__, __LINE__, e); \ - exit(EXIT_FAILURE); \ - } \ - } while (0) - -enum MpiType -{ - MPI_TYPE_BYTE, - MPI_TYPE_CHAR, - MPI_TYPE_INT, - MPI_TYPE_INT64_T, - MPI_TYPE_UINT32_T, - MPI_TYPE_UINT64_T, - MPI_TYPE_UNSIGNED_LONG_LONG, -}; - -inline MPI_Datatype getMpiDtype(MpiType dtype) -{ - static const std::unordered_map dtype_map{ - {MPI_TYPE_BYTE, MPI_BYTE}, - {MPI_TYPE_CHAR, MPI_CHAR}, - {MPI_TYPE_INT, MPI_INT}, - {MPI_TYPE_INT64_T, MPI_INT64_T}, - {MPI_TYPE_UINT32_T, MPI_UINT32_T}, - {MPI_TYPE_UINT64_T, MPI_UINT64_T}, - {MPI_TYPE_UNSIGNED_LONG_LONG, MPI_UNSIGNED_LONG_LONG}, - }; - return dtype_map.at(dtype); -} - -inline int getCommWorldSize() -{ - int size; - MPICHECK(MPI_Comm_size(MPI_COMM_WORLD, &size)); - return size; -} - -inline int getCommWorldRank() -{ - int rank; - MPICHECK(MPI_Comm_rank(MPI_COMM_WORLD, &rank)); - return rank; -} - -inline void barrier() -{ - MPICHECK(MPI_Barrier(MPI_COMM_WORLD)); -} - -inline void bcast(void* buffer, size_t size, MpiType dtype, int root) -{ - MPICHECK(MPI_Bcast(buffer, size, getMpiDtype(dtype), root, MPI_COMM_WORLD)); -} - -inline void bcast(std::vector& packed, int root) -{ - MPICHECK(MPI_Bcast(packed.data(), packed.size(), MPI_INT64_T, root, MPI_COMM_WORLD)); -} diff --git a/requirements.txt b/requirements.txt deleted file mode 100644 index 09b93670..00000000 --- a/requirements.txt +++ /dev/null @@ -1,6 +0,0 @@ -regex -fire -tritonclient[all] -transformers==4.31.0 -pandas -tabulate diff --git a/scripts/launch_triton_server.py b/scripts/launch_triton_server.py deleted file mode 100644 index 1009a81f..00000000 --- a/scripts/launch_triton_server.py +++ /dev/null @@ -1,32 +0,0 @@ -import argparse -import subprocess -from pathlib import Path - - -def parse_arguments(): - parser = argparse.ArgumentParser() - parser.add_argument('--world_size', - type=int, - default=1, - help='world size, only support tensor parallelism now') - parser.add_argument('--tritonserver', - type=str, - default='/opt/tritonserver/bin/tritonserver') - path = str(Path(__file__).parent.absolute()) + '/../all_models/gpt' - parser.add_argument('--model_repo', type=str, default=path) - return parser.parse_args() - - -def get_cmd(world_size, tritonserver, model_repo): - cmd = 'mpirun --allow-run-as-root ' - for i in range(world_size): - cmd += ' -n 1 {} --model-repository={} --disable-auto-complete-config --backend-config=python,shm-region-prefix-name=prefix{}_ : '.format( - tritonserver, model_repo, i) - cmd += '&' - return cmd - - -if __name__ == '__main__': - args = parse_arguments() - cmd = get_cmd(int(args.world_size), args.tritonserver, args.model_repo) - subprocess.call(cmd, shell=True) diff --git a/tensorrt_llm b/tensorrt_llm index 4941ad29..ae8270b7 160000 --- a/tensorrt_llm +++ b/tensorrt_llm @@ -1 +1 @@ -Subproject commit 4941ad29d2680ce67e927dd4ffa204a770af7ebe +Subproject commit ae8270b713446948246f16fadf4e2a32e35d0f62 diff --git a/tools/__init__.py b/tools/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/tools/environment_setup.sh b/tools/environment_setup.sh deleted file mode 100644 index 4367dbe1..00000000 --- a/tools/environment_setup.sh +++ /dev/null @@ -1,44 +0,0 @@ -#!/bin/bash -# Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
-# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions -# are met: -# * Redistributions of source code must retain the above copyright -# notice, this list of conditions and the following disclaimer. -# * Redistributions in binary form must reproduce the above copyright -# notice, this list of conditions and the following disclaimer in the -# documentation and/or other materials provided with the distribution. -# * Neither the name of NVIDIA CORPORATION nor the names of its -# contributors may be used to endorse or promote products derived -# from this software without specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY -# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR -# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY -# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -apt-get update && apt-get install git-lfs rapidjson-dev python3-pip python-is-python3 -y --no-install-recommends -# Update submodule -git submodule update --init --recursive -git lfs install -(cd tensorrt_llm/cpp/tensorrt_llm/batch_manager && git lfs pull) - -pip3 install -r requirements.txt --extra-index-url https://pypi.ngc.nvidia.com - -# Remove prevous TRT installation -apt-get remove --purge -y tensorrt* libnvinfer* -pip uninstall -y tensorrt - -# Download & install internal TRT release -bash tensorrt_llm/docker/common/install_tensorrt.sh - -export LD_LIBRARY_PATH=/usr/local/tensorrt/lib/:$LD_LIBRARY_PATH -export TRT_ROOT=/usr/local/tensorrt diff --git a/tools/fill_template.py b/tools/fill_template.py deleted file mode 100644 index cb298b31..00000000 --- a/tools/fill_template.py +++ /dev/null @@ -1,38 +0,0 @@ -#! /usr/bin/env python3 -from argparse import ArgumentParser -from string import Template - - -def main(file_path, substitutions, in_place): - with open(file_path) as f: - pbtxt = Template(f.read()) - - sub_dict = {} - for sub in substitutions.split(","): - key, value = sub.split(":") - sub_dict[key] = value - - pbtxt = pbtxt.safe_substitute(sub_dict) - - if in_place: - with open(file_path, "w") as f: - f.write(pbtxt) - else: - print(pbtxt) - - -if __name__ == "__main__": - parser = ArgumentParser() - parser.add_argument("file_path", help="path of the .pbtxt to modify") - parser.add_argument( - "substitutions", - help= - "substitions to perform, in the format variable_name_1:value_1,variable_name_2:value_2..." 
- ) - parser.add_argument("--in_place", - "-i", - action="http://github.com/store_true", - help="do the operation in-place") - args = parser.parse_args() - - main(**vars(args)) diff --git a/tools/gpt/client.py b/tools/gpt/client.py deleted file mode 100644 index 7bd44af0..00000000 --- a/tools/gpt/client.py +++ /dev/null @@ -1,137 +0,0 @@ -#!/usr/bin/python - -import os -import sys - -sys.path.append(os.path.dirname(os.path.dirname(os.path.realpath(__file__)))) -import argparse -from datetime import datetime - -import numpy as np -from transformers import AutoTokenizer, LlamaTokenizer, T5Tokenizer -from utils import utils - -if __name__ == '__main__': - parser = argparse.ArgumentParser() - parser.add_argument('-v', - '--verbose', - action="http://github.com/store_true", - required=False, - default=False, - help='Enable verbose output') - parser.add_argument('-u', - '--url', - type=str, - required=False, - help='Inference server URL.') - parser.add_argument( - '-i', - '--protocol', - type=str, - required=False, - default='http', - help='Protocol ("http"/"grpc") used to ' + - 'communicate with inference service. Default is "http".') - parser.add_argument( - '-t', - '--text', - type=str, - required=False, - default='Born in north-east France, Soyer trained as a', - help='Input text') - parser.add_argument('-c', - '--concurrency', - type=int, - default=1, - required=False, - help='Specify concurrency') - parser.add_argument('-beam', - '--beam_width', - type=int, - default=1, - required=False, - help='Specify beam width') - parser.add_argument('-topk', - '--topk', - type=int, - default=1, - required=False, - help='topk for sampling') - parser.add_argument('-topp', - '--topp', - type=float, - default=0.0, - required=False, - help='topp for sampling') - parser.add_argument('-o', - '--output_len', - type=int, - default=10, - required=False, - help='Specify output length') - parser.add_argument('--tokenizer_dir', - type=str, - required=True, - help='Specify tokenizer directory') - parser.add_argument('--tokenizer_type', - type=str, - default='auto', - required=False, - choices=['auto', 't5', 'llama'], - help='Specify tokenizer type') - - FLAGS = parser.parse_args() - if (FLAGS.protocol != "http") and (FLAGS.protocol != "grpc"): - print( - "unexpected protocol \"{}\", expects \"http\" or \"grpc\"".format( - FLAGS.protocol)) - exit(1) - - if FLAGS.url is None: - FLAGS.url = "localhost:8000" if FLAGS.protocol == "http" else "localhost:8001" - - if FLAGS.tokenizer_type == 't5': - tokenizer = T5Tokenizer(vocab_file=FLAGS.tokenizer_dir, - padding_side='left') - elif FLAGS.tokenizer_type == 'auto': - tokenizer = AutoTokenizer.from_pretrained(FLAGS.tokenizer_dir, - padding_side='left') - elif FLAGS.tokenizer_type == 'llama': - tokenizer = LlamaTokenizer.from_pretrained(FLAGS.tokenizer_dir, - legacy=False, - padding_side='left') - else: - raise AttributeError( - f'Unexpected tokenizer type: {FLAGS.tokenizer_type}') - tokenizer.pad_token = tokenizer.eos_token - pad_id = tokenizer.encode(tokenizer.pad_token, add_special_tokens=False)[0] - end_id = tokenizer.encode(tokenizer.eos_token, add_special_tokens=False)[0] - - line = tokenizer.encode(FLAGS.text) - input_start_ids = np.array([line], np.int32) - input_len = np.array([[len(line)]], np.int32) - inputs = utils.prepare_inputs(input_start_ids, input_len, pad_id, end_id, - FLAGS) - - start_time = datetime.now() - - with utils.create_inference_server_client(FLAGS.protocol, - FLAGS.url, - concurrency=FLAGS.concurrency, - verbose=FLAGS.verbose) 
as client: - results = utils.send_requests('tensorrt_llm', - inputs, - client, - request_parallelism=1) - output_ids = results[0].as_numpy("output_ids") - - stop_time = datetime.now() - latency = (stop_time - start_time).total_seconds() * 1000.0 - latency = round(latency, 3) - print(f"[INFO] Latency: {latency} ms") - - output_ids = output_ids.reshape( - (output_ids.size, )).tolist()[input_start_ids.shape[1]:] - output_text = tokenizer.decode(output_ids) - print(f'Input: {FLAGS.text}') - print(f'Output: {output_text}') diff --git a/tools/gpt/client_async.py b/tools/gpt/client_async.py deleted file mode 100644 index f20c1458..00000000 --- a/tools/gpt/client_async.py +++ /dev/null @@ -1,150 +0,0 @@ -#!/usr/bin/python - -import os -import sys - -sys.path.append(os.path.dirname(os.path.dirname(os.path.realpath(__file__)))) -import argparse -from datetime import datetime - -import numpy as np -import tritonclient.grpc as grpcclient -import tritonclient.http as httpclient -from transformers import AutoTokenizer, LlamaTokenizer, T5Tokenizer -from utils import utils - -if __name__ == '__main__': - parser = argparse.ArgumentParser() - parser.add_argument('-v', - '--verbose', - action="http://github.com/store_true", - required=False, - default=False, - help='Enable verbose output') - parser.add_argument('-u', - '--url', - type=str, - required=False, - help='Inference server URL.') - parser.add_argument( - '-i', - '--protocol', - type=str, - required=False, - default='http', - help='Protocol ("http"/"grpc") used to ' + - 'communicate with inference service. Default is "http".') - parser.add_argument( - '-t', - '--text', - type=str, - required=False, - default='Born in north-east France, Soyer trained as a', - help='Input text') - parser.add_argument('-c', - '--concurrency', - type=int, - default=1, - required=False, - help='Specify concurrency') - parser.add_argument('-beam', - '--beam_width', - type=int, - default=1, - required=False, - help='Specify beam width') - parser.add_argument('-topk', - '--topk', - type=int, - default=1, - required=False, - help='topk for sampling') - parser.add_argument('-topp', - '--topp', - type=float, - default=0.0, - required=False, - help='topp for sampling') - parser.add_argument('-o', - '--output_len', - type=int, - default=10, - required=False, - help='Specify output length') - parser.add_argument('--tokenizer_dir', - type=str, - required=True, - help='Specify tokenizer directory') - parser.add_argument('--tokenizer_type', - type=str, - default='auto', - required=False, - choices=['auto', 't5', 'llama'], - help='Specify tokenizer type') - - FLAGS = parser.parse_args() - if (FLAGS.protocol != "http") and (FLAGS.protocol != "grpc"): - print( - "unexpected protocol \"{}\", expects \"http\" or \"grpc\"".format( - FLAGS.protocol)) - exit(1) - - client_util = httpclient if FLAGS.protocol == "http" else grpcclient - if FLAGS.url is None: - FLAGS.url = "localhost:8000" if FLAGS.protocol == "http" else "localhost:8001" - - if FLAGS.tokenizer_type == 't5': - tokenizer = T5Tokenizer(vocab_file=FLAGS.tokenizer_dir, - padding_side='left') - elif FLAGS.tokenizer_type == 'auto': - tokenizer = AutoTokenizer.from_pretrained(FLAGS.tokenizer_dir, - padding_side='left') - elif FLAGS.tokenizer_type == 'llama': - tokenizer = LlamaTokenizer.from_pretrained(FLAGS.tokenizer_dir, - legacy=False, - padding_side='left') - else: - raise AttributeError( - f'Unexpected tokenizer type: {FLAGS.tokenizer_type}') - tokenizer.pad_token = tokenizer.eos_token - pad_id = 
tokenizer.encode(tokenizer.pad_token, add_special_tokens=False)[0] - end_id = tokenizer.encode(tokenizer.eos_token, add_special_tokens=False)[0] - - line = tokenizer.encode(FLAGS.text) - input_start_ids = np.array([line], np.int32) - input_len = np.array([[len(line)]], np.int32) - inputs = utils.prepare_inputs(input_start_ids, input_len, pad_id, end_id, - FLAGS) - - start_time = datetime.now() - - with utils.create_inference_server_client(FLAGS.protocol, - FLAGS.url, - concurrency=FLAGS.concurrency, - verbose=FLAGS.verbose) as client: - if FLAGS.protocol == "http": - async_requests = utils.send_requests_async('tensorrt_llm', - inputs, - client, - FLAGS, - request_parallelism=1) - results = utils.get_http_results(async_requests) - else: - user_data = utils.send_requests_async('tensorrt_llm', - inputs, - client, - FLAGS, - request_parallelism=1) - results = utils.get_grpc_results(user_data, request_parallelism=1) - output_ids = results[0].as_numpy("output_ids") - - stop_time = datetime.now() - latency = (stop_time - start_time).total_seconds() * 1000.0 - latency = round(latency, 3) - print(f"[INFO] Latency: {latency} ms") - - output_ids = output_ids.reshape( - (output_ids.size, )).tolist()[input_start_ids.shape[1]:] - output_text = tokenizer.decode(output_ids) - print(f'Input: {FLAGS.text}') - print(f'Output: {output_text}') diff --git a/tools/gpt/end_to_end_test.py b/tools/gpt/end_to_end_test.py deleted file mode 100644 index 809a9e86..00000000 --- a/tools/gpt/end_to_end_test.py +++ /dev/null @@ -1,264 +0,0 @@ -#!/usr/bin/python - -import os -import sys - -sys.path.append(os.path.dirname(os.path.dirname(os.path.realpath(__file__)))) -import argparse - -import numpy as np -from transformers import AutoTokenizer, LlamaTokenizer, T5Tokenizer -from utils import utils - -if __name__ == '__main__': - parser = argparse.ArgumentParser() - parser.add_argument('-v', - '--verbose', - action="http://github.com/store_true", - required=False, - default=False, - help='Enable verbose output') - parser.add_argument('-u', - '--url', - type=str, - required=False, - help='Inference server URL.') - parser.add_argument( - '-i', - '--protocol', - type=str, - required=False, - default='http', - help='Protocol ("http"/"grpc") used to ' + - 'communicate with inference service. 
Default is "http".') - parser.add_argument('-c', - '--concurrency', - type=int, - default=1, - required=False, - help='Specify concurrency') - parser.add_argument('-beam', - '--beam_width', - type=int, - default=1, - required=False, - help='Specify beam width') - parser.add_argument('-topk', - '--topk', - type=int, - default=1, - required=False, - help='topk for sampling') - parser.add_argument('-topp', - '--topp', - type=float, - default=0.0, - required=False, - help='topp for sampling') - parser.add_argument('-o', - '--output_len', - type=int, - default=10, - required=False, - help='Specify output length') - parser.add_argument('--tokenizer_dir', - type=str, - required=True, - help='Specify tokenizer directory') - parser.add_argument('--tokenizer_type', - type=str, - default='auto', - required=False, - choices=['auto', 't5', 'llama'], - help='Specify tokenizer type') - - FLAGS = parser.parse_args() - if (FLAGS.protocol != "http") and (FLAGS.protocol != "grpc"): - print( - "unexpected protocol \"{}\", expects \"http\" or \"grpc\"".format( - FLAGS.protocol)) - exit(1) - - if FLAGS.url is None: - FLAGS.url = "localhost:8000" if FLAGS.protocol == "http" else "localhost:8001" - - if FLAGS.tokenizer_type == 't5': - tokenizer = T5Tokenizer(vocab_file=FLAGS.tokenizer_dir, - padding_side='left') - elif FLAGS.tokenizer_type == 'auto': - tokenizer = AutoTokenizer.from_pretrained(FLAGS.tokenizer_dir, - padding_side='left') - elif FLAGS.tokenizer_type == 'llama': - tokenizer = LlamaTokenizer.from_pretrained(FLAGS.tokenizer_dir, - legacy=False, - padding_side='left') - else: - raise AttributeError( - f'Unexpected tokenizer type: {FLAGS.tokenizer_type}') - tokenizer.pad_token = tokenizer.eos_token - pad_id = tokenizer.encode(tokenizer.pad_token, add_special_tokens=False)[0] - end_id = tokenizer.encode(tokenizer.eos_token, add_special_tokens=False)[0] - - model_name = 'preprocessing' - with utils.create_inference_server_client(FLAGS.protocol, - FLAGS.url, - concurrency=FLAGS.concurrency, - verbose=FLAGS.verbose) as client: - input0 = [["Blackhawks\n The 2015 Hilltoppers"], - ["Data sources you can use to make a decision:"], - ["\n if(angle = 0) { if(angle"], - ["GMs typically get 78% female enrollment, but the "], - ["Previous Chapter | Index | Next Chapter"], - ["Michael, an American Jew, called Jews"], - ["Born in north-east France, Soyer trained as a"], - ["Data sources you can use to make a comparison:"]] - input0_data = np.array(input0).astype(object) - output0_len = np.ones_like(input0).astype(np.uint32) * FLAGS.output_len - bad_words_list = np.array( - [["Hawks, Hawks"], [""], [""], [""], [""], [""], [""], [""]], - dtype=object) - stop_words_list = np.array( - [[""], [""], [""], [""], [""], [""], [""], ["month, month"]], - dtype=object) - inputs = [ - utils.prepare_tensor("QUERY", input0_data, FLAGS.protocol), - utils.prepare_tensor("BAD_WORDS_DICT", bad_words_list, - FLAGS.protocol), - utils.prepare_tensor("STOP_WORDS_DICT", stop_words_list, - FLAGS.protocol), - utils.prepare_tensor("REQUEST_OUTPUT_LEN", output0_len, - FLAGS.protocol), - ] - - try: - result = client.infer(model_name, inputs) - output0 = result.as_numpy("INPUT_ID") - output1 = result.as_numpy("REQUEST_INPUT_LEN") - output2 = result.as_numpy("REQUEST_OUTPUT_LEN") - output3 = result.as_numpy("BAD_WORDS_IDS") - output4 = result.as_numpy("STOP_WORDS_IDS") - except Exception as e: - print(e) - - model_name = "tensorrt_llm" - with utils.create_inference_server_client(FLAGS.protocol, - FLAGS.url, - concurrency=1, - 
verbose=FLAGS.verbose) as client: - inputs = utils.prepare_inputs(output0, output1, pad_id, end_id, FLAGS) - - try: - result = client.infer(model_name, inputs) - output0 = result.as_numpy("output_ids") - except Exception as e: - print(e) - - model_name = "postprocessing" - with utils.create_inference_server_client(FLAGS.protocol, - FLAGS.url, - concurrency=FLAGS.concurrency, - verbose=FLAGS.verbose) as client: - inputs = [ - utils.prepare_tensor("TOKENS_BATCH", output0, FLAGS.protocol) - ] - inputs[0].set_data_from_numpy(output0) - - try: - result = client.infer(model_name, inputs) - output0 = result.as_numpy("OUTPUT") - print("============After postprocessing============") - batch_size = len(input0) - output0 = output0.reshape([-1, batch_size]).T.tolist() - output0 = [[char.decode('UTF-8') for char in line] - for line in output0] - output0 = [''.join(line) for line in output0] - for line in output0: - print(f"{line}") - print("===========================================\n\n\n") - except Exception as e: - print(e) - - model_name = "ensemble" - with utils.create_inference_server_client(FLAGS.protocol, - FLAGS.url, - concurrency=FLAGS.concurrency, - verbose=FLAGS.verbose) as client: - input0 = [["Blackhawks\n The 2015 Hilltoppers"], - ["Data sources you can use to make a decision:"], - ["\n if(angle = 0) { if(angle"], - ["GMs typically get 78% female enrollment, but the "], - ["Previous Chapter | Index | Next Chapter"], - ["Michael, an American Jew, called Jews"], - ["Born in north-east France, Soyer trained as a"], - ["Data sources you can use to make a comparison:"]] - bad_words_list = np.array( - [["Hawks, Hawks"], [""], [""], [""], [""], [""], [""], [""]], - dtype=object) - stop_words_list = np.array( - [[""], [""], [""], [""], [""], [""], [""], ["month, month"]], - dtype=object) - input0_data = np.array(input0).astype(object) - output0_len = np.ones_like(input0).astype(np.uint32) * FLAGS.output_len - runtime_top_k = (FLAGS.topk * - np.ones([input0_data.shape[0], 1])).astype(np.uint32) - runtime_top_p = FLAGS.topp * np.ones([input0_data.shape[0], 1]).astype( - np.float32) - temperature = 1.0 * np.ones([input0_data.shape[0], 1]).astype( - np.float32) - len_penalty = 1.0 * np.ones([input0_data.shape[0], 1]).astype( - np.float32) - repetition_penalty = 1.0 * np.ones([input0_data.shape[0], 1]).astype( - np.float32) - random_seed = 0 * np.ones([input0_data.shape[0], 1]).astype(np.uint64) - output_log_probs = True * np.ones([input0_data.shape[0], 1 - ]).astype(bool) - beam_width = (FLAGS.beam_width * - np.ones([input0_data.shape[0], 1])).astype(np.uint32) - pad_ids = pad_id * \ - np.ones([input0_data.shape[0], 1]).astype(np.uint32) - end_ids = end_id * \ - np.ones([input0_data.shape[0], 1]).astype(np.uint32) - min_length = 1 * \ - np.ones([input0_data.shape[0], 1]).astype(np.uint32) - presence_penalty = 0.0 * \ - np.ones([input0_data.shape[0], 1]).astype(np.float32) - inputs = [ - utils.prepare_tensor("text_input", input0_data, FLAGS.protocol), - utils.prepare_tensor("max_tokens", output0_len, FLAGS.protocol), - utils.prepare_tensor("bad_words", bad_words_list, FLAGS.protocol), - utils.prepare_tensor("stop_words", stop_words_list, - FLAGS.protocol), - utils.prepare_tensor("pad_id", pad_ids, FLAGS.protocol), - utils.prepare_tensor("end_id", end_ids, FLAGS.protocol), - utils.prepare_tensor("beam_width", beam_width, FLAGS.protocol), - utils.prepare_tensor("top_k", runtime_top_k, FLAGS.protocol), - utils.prepare_tensor("top_p", runtime_top_p, FLAGS.protocol), - utils.prepare_tensor("temperature", 
temperature, FLAGS.protocol), - utils.prepare_tensor("length_penalty", len_penalty, - FLAGS.protocol), - utils.prepare_tensor("repetition_penalty", repetition_penalty, - FLAGS.protocol), - utils.prepare_tensor("min_length", min_length, FLAGS.protocol), - utils.prepare_tensor("presence_penalty", presence_penalty, - FLAGS.protocol), - utils.prepare_tensor("random_seed", random_seed, FLAGS.protocol), - utils.prepare_tensor("output_log_probs", output_log_probs, - FLAGS.protocol), - ] - - try: - result = client.infer(model_name, inputs) - ensemble_output0 = result.as_numpy("text_output") - print("============After ensemble============") - batch_size = len(input0) - ensemble_output0 = ensemble_output0.reshape([-1, batch_size - ]).T.tolist() - ensemble_output0 = [[char.decode('UTF-8') for char in line] - for line in ensemble_output0] - ensemble_output0 = [''.join(line) for line in ensemble_output0] - for line in ensemble_output0: - print(f"{line}") - except Exception as e: - print(e) - - assert output0 == ensemble_output0 diff --git a/tools/gpt/gen_input_data.py b/tools/gpt/gen_input_data.py deleted file mode 100644 index 00a29dcb..00000000 --- a/tools/gpt/gen_input_data.py +++ /dev/null @@ -1,108 +0,0 @@ -import argparse -import json - -import numpy as np - - -def add_sample(sample, name, array): - sample[name] = {'content': array.flatten().tolist(), 'shape': array.shape} - - -def main(args): - data = {'data': []} - input_start_ids = np.random.randint(0, - 50255, - size=(args.start_len), - dtype=np.int32) - input_len = np.array([input_start_ids.shape[0]], np.int32) - output_len = np.ones([1]).astype(np.uint32) * args.output_len - runtime_top_k = (args.topk * np.ones([1])).astype(np.uint32) - runtime_top_p = args.topp * np.ones([1]).astype(np.float32) - beam_search_diversity_rate = 0.0 * np.ones([1]).astype(np.float32) - temperature = 1.0 * np.ones([1]).astype(np.float32) - len_penalty = 1.0 * np.ones([1]).astype(np.float32) - repetition_penalty = 1.0 * np.ones([1]).astype(np.float32) - random_seed = 0 * np.ones([1]).astype(np.uint64) - # is_return_log_probs = True * np.ones([1]).astype(bool) - beam_width = (args.beam_width * np.ones([1])).astype(np.uint32) - # start_ids = 50256 * np.ones([1]).astype(np.uint32) - # end_ids = 50256 * np.ones([1]).astype(np.uint32) - # bad_words_list = np.concatenate([ - # np.zeros([1, 1]).astype(np.int32), - # (-1 * np.ones([1, 1])).astype(np.int32) - # ], - # axis=1) - # stop_word_list = np.concatenate([ - # np.zeros([1, 1]).astype(np.int32), - # (-1 * np.ones([1, 1])).astype(np.int32) - # ], - # axis=1) - - for _ in range(args.num_samples): - sample = {} - add_sample(sample, 'input_ids', input_start_ids) - add_sample(sample, 'input_lengths', input_len) - add_sample(sample, 'request_output_len', output_len) - add_sample(sample, 'runtime_top_k', runtime_top_k) - add_sample(sample, 'runtime_top_p', runtime_top_p) - add_sample(sample, 'beam_search_diversity_rate', - beam_search_diversity_rate) - add_sample(sample, 'temperature', temperature) - add_sample(sample, 'len_penalty', len_penalty) - add_sample(sample, 'repetition_penalty', repetition_penalty) - add_sample(sample, 'random_seed', random_seed) - add_sample(sample, 'beam_width', beam_width) - # add_sample(sample, 'top_p_decay', top_p_decay) - # add_sample(sample, 'top_p_min', top_p_min) - # add_sample(sample, 'top_p_reset_ids', top_p_reset_ids) - data['data'].append(sample) - - with open('input_data.json', 'w') as f: - json.dump(data, f, indent=4) - - -if __name__ == '__main__': - parser = 
argparse.ArgumentParser() - parser.add_argument('-b', - '--batch_size', - type=int, - default=8, - required=False, - help='Specify batch size') - parser.add_argument('-beam', - '--beam_width', - type=int, - default=1, - required=False, - help='Specify beam width') - parser.add_argument('-topk', - '--topk', - type=int, - default=1, - required=False, - help='topk for sampling') - parser.add_argument('-topp', - '--topp', - type=float, - default=0.0, - required=False, - help='topp for sampling') - parser.add_argument('-s', - '--start_len', - type=int, - default=8, - required=False, - help='Specify input length') - parser.add_argument('-o', - '--output_len', - type=int, - default=10, - required=False, - help='Specify output length') - parser.add_argument('--num_samples', - type=int, - default=10000, - required=False, - help='Specify number of samples to generate') - args = parser.parse_args() - main(args) diff --git a/tools/gpt/identity_test.py b/tools/gpt/identity_test.py deleted file mode 100644 index fdb4e93a..00000000 --- a/tools/gpt/identity_test.py +++ /dev/null @@ -1,178 +0,0 @@ -#!/usr/bin/python - -import os -import sys - -sys.path.append(os.path.dirname(os.path.dirname(os.path.realpath(__file__)))) -import argparse -import statistics as s -from builtins import range -from datetime import datetime - -import numpy as np -from utils import utils - -if __name__ == '__main__': - parser = argparse.ArgumentParser() - parser.add_argument('-v', - '--verbose', - action="http://github.com/store_true", - required=False, - default=False, - help='Enable verbose output') - parser.add_argument('-u', - '--url', - type=str, - required=False, - help='Inference server URL.') - parser.add_argument( - '-i', - '--protocol', - type=str, - required=False, - default='http', - help='Protocol ("http"/"grpc") used to ' + - 'communicate with inference service. 
Default is "http".') - parser.add_argument('-w', - '--warm_up', - action="http://github.com/store_true", - required=False, - default=False, - help='Enable warm_up before benchmark') - parser.add_argument('-c', - '--concurrency', - type=int, - default=1, - required=False, - help='Specify concurrency') - parser.add_argument('-p', - '--request_parallelism', - type=int, - default=10, - required=False, - help='Specify request parallelism') - parser.add_argument('-m', - '--mode', - type=str, - required=False, - default='sync', - help='Mode ("sync"/"async").') - parser.add_argument('-b', - '--batch_size', - type=int, - default=8, - required=False, - help='Specify batch size') - parser.add_argument('-beam', - '--beam_width', - type=int, - default=1, - required=False, - help='Specify beam width') - parser.add_argument('-topk', - '--topk', - type=int, - default=1, - required=False, - help='topk for sampling') - parser.add_argument('-topp', - '--topp', - type=float, - default=0.0, - required=False, - help='topp for sampling') - parser.add_argument('-s', - '--start_len', - type=int, - default=8, - required=False, - help='Specify input length') - parser.add_argument('-o', - '--output_len', - type=int, - default=10, - required=False, - help='Specify output length') - parser.add_argument( - '-n', - '--num_runs', - type=int, - default=1, - required=False, - help="Spedifty number of runs to get the average latency") - - FLAGS = parser.parse_args() - if (FLAGS.protocol != "http") and (FLAGS.protocol != "grpc"): - print( - "unexpected protocol \"{}\", expects \"http\" or \"grpc\"".format( - FLAGS.protocol)) - exit(1) - - if FLAGS.url is None: - FLAGS.url = "localhost:8000" if FLAGS.protocol == "http" else "localhost:8001" - input_start_ids = np.random.randint(0, - 50255, - size=(FLAGS.batch_size, - FLAGS.start_len), - dtype=np.int32) - input_len = np.array([[input_start_ids.shape[1]] - for _ in range(input_start_ids.shape[0])], np.int32) - inputs = utils.prepare_inputs(input_start_ids, - input_len, - pad_id=0, - end_id=2, - flags=FLAGS) - - # warm up - if FLAGS.warm_up: - print("[INFO] sending requests to warm up") - with utils.create_inference_server_client( - FLAGS.protocol, - FLAGS.url, - concurrency=FLAGS.concurrency, - verbose=FLAGS.verbose) as client: - utils.send_requests('tensorrt_llm', - inputs, - client, - request_parallelism=2) - - latencies = [] - for i in range(FLAGS.num_runs): - start_time = datetime.now() - - with utils.create_inference_server_client( - FLAGS.protocol, - FLAGS.url, - concurrency=FLAGS.concurrency, - verbose=FLAGS.verbose) as client: - if FLAGS.mode == 'sync': - utils.send_requests('tensorrt_llm', inputs, client, - FLAGS.request_parallelism) - else: - if FLAGS.protocol == "http": - async_requests = utils.send_requests_async( - 'tensorrt_llm', inputs, client, FLAGS, - FLAGS.request_parallelism) - results = utils.get_http_results(async_requests) - else: - user_data = utils.send_requests_async( - 'tensorrt_llm', inputs, client, FLAGS, - FLAGS.request_parallelism) - results = utils.get_grpc_results(user_data, - FLAGS.request_parallelism) - - stop_time = datetime.now() - latencies.append((stop_time - start_time).total_seconds() * 1000.0 / - FLAGS.request_parallelism) - - if FLAGS.num_runs > 1: - latency = s.mean(latencies) - else: - latency = latencies[0] - latency = round(latency, 3) - throughput = round(1000 / latency * FLAGS.batch_size, 3) - print( - f"[INFO] Batch size: {FLAGS.batch_size}, Start len: {FLAGS.start_len}, Output len: {FLAGS.output_len}" - ) - 
print(f"[INFO] Latency: {latency} ms") - print(f"[INFO] Throughput: {throughput} sentences / sec") diff --git a/tools/inflight_batcher_llm/end_to_end_streaming_client.py b/tools/inflight_batcher_llm/end_to_end_streaming_client.py deleted file mode 100644 index faa5e6b9..00000000 --- a/tools/inflight_batcher_llm/end_to_end_streaming_client.py +++ /dev/null @@ -1,131 +0,0 @@ -#!/usr/bin/python - -import os -import sys -from functools import partial - -sys.path.append(os.path.dirname(os.path.dirname(os.path.realpath(__file__)))) - -import argparse -import queue -import sys - -import numpy as np -import tritonclient.grpc as grpcclient -from tritonclient.utils import InferenceServerException -from utils import utils - - -class UserData: - - def __init__(self): - self._completed_requests = queue.Queue() - - -def callback(user_data, result, error): - if error: - user_data._completed_requests.put(error) - else: - user_data._completed_requests.put(result) - output = result.as_numpy('text_output') - print(output[0], flush=True) - - -def test(triton_client, prompt): - model_name = "ensemble" - - input0 = [[prompt]] - input0_data = np.array(input0).astype(object) - output0_len = np.ones_like(input0).astype(np.uint32) * FLAGS.output_len - bad_words_list = np.array([[""]], dtype=object) - stop_words_list = np.array([[""]], dtype=object) - streaming = [[FLAGS.streaming]] - streaming_data = np.array(streaming, dtype=bool) - - inputs = [ - utils.prepare_tensor("text_input", input0_data, FLAGS.protocol), - utils.prepare_tensor("max_tokens", output0_len, FLAGS.protocol), - utils.prepare_tensor("bad_words", bad_words_list, FLAGS.protocol), - utils.prepare_tensor("stop_words", stop_words_list, FLAGS.protocol), - utils.prepare_tensor("stream", streaming_data, FLAGS.protocol), - ] - - user_data = UserData() - # Establish stream - triton_client.start_stream(callback=partial(callback, user_data)) - # Send request - triton_client.async_stream_infer(model_name, inputs) - - #Wait for server to close the stream - triton_client.stop_stream() - - # Parse the responses - while True: - try: - result = user_data._completed_requests.get(block=False) - except Exception: - break - - if type(result) == InferenceServerException: - print("Received an error from server:") - print(result) - else: - result.as_numpy('text_output') - - -if __name__ == '__main__': - parser = argparse.ArgumentParser() - parser.add_argument('-v', - '--verbose', - action="http://github.com/store_true", - required=False, - default=False, - help='Enable verbose output') - parser.add_argument('-u', - '--url', - type=str, - required=False, - help='Inference server URL.') - - parser.add_argument('-p', - '--prompt', - type=str, - required=True, - help='Input prompt.') - parser.add_argument( - "-S", - "--streaming", - action="http://github.com/store_true", - required=False, - default=False, - help="Enable streaming mode. Default is False.", - ) - - parser.add_argument( - '-i', - '--protocol', - type=str, - required=False, - default='grpc', - choices=['grpc'], - help='Protocol ("http"/"grpc") used to ' + - 'communicate with inference service. 
Default is "http".') - - parser.add_argument('-o', - '--output_len', - type=int, - default=100, - required=False, - help='Specify output length') - - FLAGS = parser.parse_args() - if FLAGS.url is None: - FLAGS.url = "localhost:8000" if FLAGS.protocol == "http" else "localhost:8001" - - try: - client = grpcclient.InferenceServerClient(url=FLAGS.url) - except Exception as e: - print("client creation failed: " + str(e)) - sys.exit(1) - - test(client, FLAGS.prompt) diff --git a/tools/inflight_batcher_llm/end_to_end_test.py b/tools/inflight_batcher_llm/end_to_end_test.py deleted file mode 100644 index adde47ec..00000000 --- a/tools/inflight_batcher_llm/end_to_end_test.py +++ /dev/null @@ -1,230 +0,0 @@ -#!/usr/bin/python - -import os -import sys - -sys.path.append(os.path.dirname(os.path.dirname(os.path.realpath(__file__)))) - -import argparse -import json -import sys -from datetime import datetime -from functools import partial - -import numpy as np -from utils import utils - - -def callback(user_data, start_time, result, error): - user_data._completed_requests.put((result, error)) - stop_time = datetime.now() - latency = (stop_time - start_time).total_seconds() * 1000.0 - latency = round(latency, 3) - user_data._latencies.append(latency) - - -def test_functionality(client, prompts, output_lens): - print(f"[INFO] Start testing on {len(prompts)} prompts.") - for i, prompt in enumerate(prompts): - - # 1. Ensemble models manually: preprocessing -> tensorrt_llm -> postprocessing - model_name = 'preprocessing' - input0 = [[prompt]] - input0_data = np.array(input0).astype(object) - output0_len = np.ones_like(input0).astype(np.uint32) * output_lens[i] - bad_words_list = np.array([[""]], dtype=object) - stop_words_list = np.array([[""]], dtype=object) - - inputs = [ - utils.prepare_tensor("QUERY", input0_data, FLAGS.protocol), - utils.prepare_tensor("BAD_WORDS_DICT", bad_words_list, - FLAGS.protocol), - utils.prepare_tensor("STOP_WORDS_DICT", stop_words_list, - FLAGS.protocol), - utils.prepare_tensor("REQUEST_OUTPUT_LEN", output0_len, - FLAGS.protocol), - ] - result = client.infer(model_name, inputs, request_id=str(i)) - output0 = result.as_numpy("INPUT_ID") - output1 = result.as_numpy("REQUEST_INPUT_LEN") - output2 = result.as_numpy("REQUEST_OUTPUT_LEN") - - model_name = "tensorrt_llm" - inputs = [ - utils.prepare_tensor("input_ids", output0, FLAGS.protocol), - utils.prepare_tensor("input_lengths", output1, FLAGS.protocol), - utils.prepare_tensor("request_output_len", output2, - FLAGS.protocol), - ] - result = client.infer(model_name, inputs, request_id=str(i)) - output0 = result.as_numpy("output_ids") - - model_name = "postprocessing" - inputs = [ - utils.prepare_tensor("TOKENS_BATCH", output0, FLAGS.protocol) - ] - inputs[0].set_data_from_numpy(output0) - - result = client.infer(model_name, inputs, request_id=str(i)) - output0 = result.as_numpy("OUTPUT") - - # 2. 
Use ensemble model - model_name = "ensemble" - input0 = [[prompt]] - input0_data = np.array(input0).astype(object) - output0_len = np.ones_like(input0).astype(np.uint32) * output_lens[i] - bad_words_list = np.array([[""]], dtype=object) - stop_words_list = np.array([[""]], dtype=object) - - inputs = [ - utils.prepare_tensor("text_input", input0_data, FLAGS.protocol), - utils.prepare_tensor("max_tokens", output0_len, FLAGS.protocol), - utils.prepare_tensor("bad_words", bad_words_list, FLAGS.protocol), - utils.prepare_tensor("stop_words", stop_words_list, - FLAGS.protocol), - ] - - result = client.infer(model_name, inputs, request_id=str(i)) - - # 3. Check the results between manually ensembled models and the ensemble model - ensemble_output = result.as_numpy('text_output') - assert output0 == ensemble_output - if FLAGS.verbose: - print('Response: {}'.format(result.get_response())) - print('Output: {}'.format(ensemble_output)) - print(f"[INFO] Functionality test succeed.") - - -def test_performance(client, prompts, output_lens): - model_name = "ensemble" - - print(f"[INFO] Warm up for benchmarking.") - for i in range(10): - input0 = [[prompts[0]]] - input0_data = np.array(input0).astype(object) - output0_len = np.ones_like(input0).astype(np.uint32) * output_lens[i] - bad_words_list = np.array([[""]], dtype=object) - stop_words_list = np.array([[""]], dtype=object) - - inputs = [ - utils.prepare_tensor("text_input", input0_data, FLAGS.protocol), - utils.prepare_tensor("max_tokens", output0_len, FLAGS.protocol), - utils.prepare_tensor("bad_words", bad_words_list, FLAGS.protocol), - utils.prepare_tensor("stop_words", stop_words_list, - FLAGS.protocol), - ] - - client.infer(model_name, inputs, request_id=str(i)) - - print(f"[INFO] Start benchmarking on {len(prompts)} prompts.") - latency = 0 - async_requests = [] - start_time = datetime.now() - user_data = utils.UserData() - for i, prompt in enumerate(prompts): - input0 = [[prompt]] - input0_data = np.array(input0).astype(object) - output0_len = np.ones_like(input0).astype(np.uint32) * output_lens[i] - bad_words_list = np.array([[""]], dtype=object) - stop_words_list = np.array([[""]], dtype=object) - - inputs = [ - utils.prepare_tensor("text_input", input0_data, FLAGS.protocol), - utils.prepare_tensor("max_tokens", output0_len, FLAGS.protocol), - utils.prepare_tensor("bad_words", bad_words_list, FLAGS.protocol), - utils.prepare_tensor("stop_words", stop_words_list, - FLAGS.protocol), - ] - - if FLAGS.protocol == "http": - async_requests.append( - client.async_infer(model_name, inputs, request_id=str(i))) - elif FLAGS.protocol == "grpc": - async_requests.append( - client.async_infer(model_name, - inputs, - callback=partial(callback, user_data, - datetime.now()), - request_id=str(i))) - - if FLAGS.protocol == "http": - utils.get_http_results(async_requests) - elif FLAGS.protocol == "grpc": - utils.get_grpc_results(user_data, len(prompts)) - else: - raise RuntimeError("Invalid protocol") - - stop_time = datetime.now() - latency = (stop_time - start_time).total_seconds() * 1000.0 - latency = round(latency, 3) - print(f"[INFO] Total Latency: {latency} ms") - - -if __name__ == '__main__': - parser = argparse.ArgumentParser() - parser.add_argument('-v', - '--verbose', - action="http://github.com/store_true", - required=False, - default=False, - help='Enable verbose output') - parser.add_argument('-u', - '--url', - type=str, - required=False, - help='Inference server URL.') - parser.add_argument( - '-i', - '--protocol', - type=str, - 
required=False, - default='http', - choices=['http', 'grpc'], - help='Protocol ("http"/"grpc") used to ' + - 'communicate with inference service. Default is "http".') - parser.add_argument('-c', - '--concurrency', - type=int, - default=128, - required=False, - help='Specify concurrency') - parser.add_argument('--max_input_len', - type=int, - required=True, - help='Specify max input length') - - parser.add_argument('--dataset', - type=str, - required=True, - help='Dataset path used for the test.') - - FLAGS = parser.parse_args() - if FLAGS.url is None: - FLAGS.url = "localhost:8000" if FLAGS.protocol == "http" else "localhost:8001" - - try: - client = utils.create_inference_server_client( - FLAGS.protocol, - FLAGS.url, - concurrency=FLAGS.concurrency, - verbose=FLAGS.verbose) - except Exception as e: - print("channel creation failed: " + str(e)) - sys.exit(1) - - prompts = [] - output_lens = [] - with open(FLAGS.dataset, 'r') as f: - data_dict = json.load(f) - for req in data_dict: - prompt = req['input'] + ' ' + req['instruction'] - output = req['output'] - # 1.3 is a magic number that converts number of words to number of tokens - if int(len(prompt.split(' ')) / 1.3) > FLAGS.max_input_len: - continue - prompts.append(prompt) - # 1.3 is a magic number that converts number of words to number of tokens - output_lens.append(int(len(output.split(' ')) * 1.3)) - - test_functionality(client, prompts, output_lens) - test_performance(client, prompts, output_lens) diff --git a/tools/inflight_batcher_llm/identity_test.py b/tools/inflight_batcher_llm/identity_test.py deleted file mode 100644 index 10ebc625..00000000 --- a/tools/inflight_batcher_llm/identity_test.py +++ /dev/null @@ -1,237 +0,0 @@ -#!/usr/bin/python - -import os -import sys - -sys.path.append(os.path.dirname(os.path.dirname(os.path.realpath(__file__)))) - -import argparse -import json -import sys -import time -from datetime import datetime -from functools import partial - -import numpy as np -from transformers import AutoTokenizer, LlamaTokenizer, T5Tokenizer -from utils import utils - - -def callback(user_data, start_time, req_id, result, error): - user_data._completed_requests.put((result, error)) - stop_time = datetime.now() - latency = (stop_time - start_time).total_seconds() * 1000.0 - latency = round(latency, 3) - user_data._latencies.append(latency) - user_data._latency_dict[req_id] = latency - user_data._start_time_dict[req_id] = start_time - user_data._stop_time_dict[req_id] = stop_time - - -def test_performance(client, input_start_ids, input_lens, output_lens, FLAGS): - model_name = "tensorrt_llm" - - print(f"[INFO] Warm up for benchmarking.") - for i in range(10): - output0_len = np.ones_like([[1]]).astype(np.uint32) * 100 - inputs = [ - utils.prepare_tensor("input_ids", input_start_ids[0], - FLAGS.protocol), - utils.prepare_tensor("input_lengths", input_lens[i], - FLAGS.protocol), - utils.prepare_tensor("request_output_len", output0_len, - FLAGS.protocol), - ] - client.infer(model_name, inputs, request_id=str(i)) - - print(f"[INFO] Start benchmarking on {len(input_start_ids)} prompts.") - latency = 0 - async_requests = [] - start_time = datetime.now() - user_data = utils.UserData() - for i, ids in enumerate(input_start_ids): - output0_len = np.ones_like([[1]]).astype(np.uint32) * output_lens[i] - inputs = [ - utils.prepare_tensor("input_ids", ids, FLAGS.protocol), - utils.prepare_tensor("input_lengths", input_lens[i], - FLAGS.protocol), - utils.prepare_tensor("request_output_len", output0_len, - FLAGS.protocol), - ] - - if 
len(FLAGS.time_bet_reqs) == 1: - time.sleep(FLAGS.time_bet_reqs[0]) - else: - time.sleep(FLAGS.time_bet_reqs[i % len(FLAGS.time_bet_reqs)]) - - if FLAGS.protocol == "http": - async_requests.append( - client.async_infer(model_name, inputs, request_id=str(i))) - elif FLAGS.protocol == "grpc": - async_requests.append( - client.async_infer(model_name, - inputs, - callback=partial(callback, user_data, - datetime.now(), i), - request_id=str(i))) - - try: - if FLAGS.protocol == "http": - utils.get_http_results(async_requests) - elif FLAGS.protocol == "grpc": - responses = utils.get_grpc_results(user_data, len(input_start_ids)) - else: - raise RuntimeError("Invalid protocol") - - stop_time = datetime.now() - latency = (stop_time - start_time).total_seconds() * 1000.0 - latency = round(latency, 3) - print(f"[INFO] Total Latency: {latency} ms") - - # TODO(kaiyu): support `extract_print_stats` for http - if FLAGS.protocol == "grpc": - request_latencies = 0.0 - for latency in user_data._latencies: - request_latencies += latency - print(f"[INFO] Total request latencies: {request_latencies} ms") - - ip_token_len_list = [] - for ip in input_lens: - ip_token_len_list.append( - ip[0][0]) #for some reason, two level nesting - - utils.extract_print_stats(ip_token_len_list, responses, user_data, - FLAGS) - - except Exception as e: - print("Failed receiving responses: " + str(e)) - sys.exit(1) - - -if __name__ == '__main__': - parser = argparse.ArgumentParser() - parser.add_argument('-v', - '--verbose', - action="http://github.com/store_true", - required=False, - default=False, - help='Enable verbose output') - parser.add_argument('-u', - '--url', - type=str, - required=False, - help='Inference server URL.') - parser.add_argument( - '-i', - '--protocol', - type=str, - required=False, - default='http', - choices=['http', 'grpc'], - help='Protocol ("http"/"grpc") used to ' + - 'communicate with inference service. Default is "http".') - parser.add_argument('-c', - '--concurrency', - type=int, - default=128, - required=False, - help='Specify concurrency') - parser.add_argument('--max_input_len', - type=int, - required=True, - help='Specify max input length') - - parser.add_argument('--dataset', - type=str, - required=True, - help='Dataset path used for the test.') - parser.add_argument('--tokenizer_dir', - type=str, - required=True, - help='Specify tokenizer directory') - parser.add_argument('--tokenizer_type', - type=str, - default='auto', - required=False, - choices=['auto', 't5', 'llama'], - help='Specify tokenizer type') - parser.add_argument( - '--time_bet_reqs', - type=float, - required=False, - nargs='+', - help="Input time(s) in (secs) bet requests separated by spaces", - default=[0]) - parser.add_argument( - '--dump_perfetto_trace', - action="http://github.com/store_true", - required=False, - default=False, - help= - 'Dumps trace of requests in a json (perfetto.json) to be visualized in perfetto' - ), - parser.add_argument('--op_stats_csv', - type=str, - default=None, - help='csv filename to dump stats'), - parser.add_argument( - '--op_tokens_per_word', - type=float, - default=1.3, - required=False, - help= - 'Specify op tokens/word ratio. 
Useful to have model generate as many number of words as in dataset'
-    )
-
-    FLAGS = parser.parse_args()
-    if FLAGS.url is None:
-        FLAGS.url = "localhost:8000" if FLAGS.protocol == "http" else "localhost:8001"
-
-    try:
-        client = utils.create_inference_server_client(
-            FLAGS.protocol,
-            FLAGS.url,
-            concurrency=FLAGS.concurrency,
-            verbose=FLAGS.verbose)
-    except Exception as e:
-        print("channel creation failed: " + str(e))
-        sys.exit(1)
-
-    if FLAGS.tokenizer_type == 't5':
-        tokenizer = T5Tokenizer(vocab_file=FLAGS.tokenizer_dir,
-                                padding_side='left')
-    elif FLAGS.tokenizer_type == 'auto':
-        tokenizer = AutoTokenizer.from_pretrained(FLAGS.tokenizer_dir,
-                                                  padding_side='left')
-    elif FLAGS.tokenizer_type == 'llama':
-        tokenizer = LlamaTokenizer.from_pretrained(FLAGS.tokenizer_dir,
-                                                   legacy=False,
-                                                   padding_side='left')
-    else:
-        raise AttributeError(
-            f'Unexpected tokenizer type: {FLAGS.tokenizer_type}')
-    tokenizer.pad_token = tokenizer.eos_token
-
-    input_start_ids = []
-    input_lens = []
-    output_lens = []
-    ratio = []
-    with open(FLAGS.dataset, 'r') as f:
-        data_dict = json.load(f)
-        for req in data_dict:
-            prompt = req['input'] + ' ' + req['instruction']
-            output = req['output']
-            line = tokenizer.encode(prompt)
-            if len(line) > FLAGS.max_input_len:
-                continue
-            input_start_ids.append(np.array([line], np.int32))
-            input_lens.append(np.array([[len(line)]], np.int32))
-            output_lens.append(
-                int(len(output.split(' ')) * FLAGS.op_tokens_per_word))
-            prompt_tokens = len(line)
-            prompt_words = len(prompt.split())
-            ratio.append(prompt_tokens / prompt_words)
-
-    print("Tokens per word: ", round(np.mean(ratio), 3))
-    test_performance(client, input_start_ids, input_lens, output_lens, FLAGS)
-    print("Expected op tokens", round(np.mean(output_lens), 3))
diff --git a/tools/utils.sh b/tools/utils.sh
deleted file mode 100644
index 042f1e5a..00000000
--- a/tools/utils.sh
+++ /dev/null
@@ -1,33 +0,0 @@
-#!/bin/bash
-
-# Wait until server health endpoint shows ready. Sets WAIT_RET to 0 on
-# success, 1 on failure
-function wait_for_server_ready() {
-    local spid="$1"; shift
-    local wait_time_secs="${1:-30}"; shift
-
-    WAIT_RET=0
-
-    local wait_secs=$wait_time_secs
-    until test $wait_secs -eq 0 ; do
-        if ! kill -0 $spid; then
-            echo "=== Server not running."
-            WAIT_RET=1
-            return
-        fi
-
-        sleep 1;
-
-        set +e
-        code=`curl -s -w %{http_code} localhost:8000/v2/health/ready`
-        set -e
-        if [ "$code" == "200" ]; then
-            return
-        fi
-
-        ((wait_secs--));
-    done
-
-    echo "=== Timeout $wait_time_secs secs. Server not ready."
-    WAIT_RET=1
-}
diff --git a/tools/utils/__init__.py b/tools/utils/__init__.py
deleted file mode 100644
index e69de29b..00000000
diff --git a/tools/utils/utils.py b/tools/utils/utils.py
deleted file mode 100644
index 4683b72d..00000000
--- a/tools/utils/utils.py
+++ /dev/null
@@ -1,345 +0,0 @@
-import csv
-import json
-import math
-import queue
-from datetime import timedelta
-from functools import partial
-
-import numpy as np
-import pandas as pd
-import tritonclient.grpc as grpcclient
-import tritonclient.http as httpclient
-from tabulate import tabulate
-from tritonclient.utils import np_to_triton_dtype
-
-
-class UserData:
-
-    def __init__(self):
-        self._completed_requests = queue.Queue()
-        self._latencies = []
-        self._latency_dict = {}
-        self._start_time_dict = {}
-        self._stop_time_dict = {}
-
-
-# Callback function used for async_stream_infer()
-def completion_callback(user_data, result, error):
-    # passing error raise and handling out
-    user_data._completed_requests.put((result, error))
-
-
-def prepare_tensor(name, input, protocol):
-    client_util = httpclient if protocol == "http" else grpcclient
-    t = client_util.InferInput(name, input.shape,
-                               np_to_triton_dtype(input.dtype))
-    t.set_data_from_numpy(input)
-    return t
-
-
-def prepare_inputs(input_start_ids, input_len, pad_id, end_id, flags):
-    output_len = np.ones([input_start_ids.shape[0], 1]).astype(
-        np.uint32) * flags.output_len
-    runtime_top_k = (flags.topk *
-                     np.ones([input_start_ids.shape[0], 1])).astype(np.uint32)
-    runtime_top_p = flags.topp * \
-        np.ones([input_start_ids.shape[0], 1]).astype(np.float32)
-    beam_search_diversity_rate = 0.0 * \
-        np.ones([input_start_ids.shape[0], 1]).astype(np.float32)
-    temperature = 1.0 * \
-        np.ones([input_start_ids.shape[0], 1]).astype(np.float32)
-    len_penalty = 1.0 * \
-        np.ones([input_start_ids.shape[0], 1]).astype(np.float32)
-    repetition_penalty = 1.0 * \
-        np.ones([input_start_ids.shape[0], 1]).astype(np.float32)
-    random_seed = 0 * \
-        np.ones([input_start_ids.shape[0], 1]).astype(np.uint64)
-    output_log_probs = True * \
-        np.ones([input_start_ids.shape[0], 1]).astype(bool)
-    beam_width = (flags.beam_width *
-                  np.ones([input_start_ids.shape[0], 1])).astype(np.uint32)
-    pad_ids = pad_id * \
-        np.ones([input_start_ids.shape[0], 1]).astype(np.uint32)
-    end_ids = end_id * \
-        np.ones([input_start_ids.shape[0], 1]).astype(np.uint32)
-    min_length = 1 * \
-        np.ones([input_start_ids.shape[0], 1]).astype(np.uint32)
-    presence_penalty = 0.0 * \
-        np.ones([input_start_ids.shape[0], 1]).astype(np.float32)
-    bad_words_list = np.concatenate([
-        np.zeros([input_start_ids.shape[0], 1, 1]).astype(np.int32),
-        (-1 * np.ones([input_start_ids.shape[0], 1, 1])).astype(np.int32)
-    ],
-                                    axis=1)
-    stop_word_list = np.concatenate([
-        np.zeros([input_start_ids.shape[0], 1, 1]).astype(np.int32),
-        (-1 * np.ones([input_start_ids.shape[0], 1, 1])).astype(np.int32)
-    ],
-                                    axis=1)
-    inputs = [
-        prepare_tensor("input_ids", input_start_ids, flags.protocol),
-        prepare_tensor("input_lengths", input_len, flags.protocol),
-        prepare_tensor("request_output_len", output_len, flags.protocol),
-        prepare_tensor("pad_id", pad_ids, flags.protocol),
-        prepare_tensor("end_id", end_ids, flags.protocol),
-        prepare_tensor("beam_width", beam_width, flags.protocol),
-        prepare_tensor("temperature", temperature, flags.protocol),
-        prepare_tensor("runtime_top_k", runtime_top_k, flags.protocol),
-        prepare_tensor("runtime_top_p", runtime_top_p, flags.protocol),
-        prepare_tensor("len_penalty", len_penalty, flags.protocol),
-        prepare_tensor("repetition_penalty", repetition_penalty,
-                       flags.protocol),
-        prepare_tensor("min_length", min_length, flags.protocol),
-        prepare_tensor("presence_penalty", presence_penalty, flags.protocol),
-        prepare_tensor("random_seed", random_seed, flags.protocol),
-        prepare_tensor("output_log_probs", output_log_probs, flags.protocol),
-        # prepare_tensor("bad_words_list", bad_words_list, flags.protocol),
-        # prepare_tensor("stop_words_list", stop_word_list, flags.protocol),
-    ]
-    return inputs
-
-
-def create_inference_server_client(protocol, url, concurrency, verbose):
-    client_util = httpclient if protocol == "http" else grpcclient
-    if protocol == "http":
-        return client_util.InferenceServerClient(url,
-                                                 concurrency=concurrency,
-                                                 verbose=verbose)
-    elif protocol == "grpc":
-        return client_util.InferenceServerClient(url, verbose=verbose)
-
-
-def send_requests(model_name, inputs, client, request_parallelism):
-    results = []
-    for _ in range(request_parallelism):
-        result = client.infer(model_name, inputs)
-        results.append(result)
-    return results
-
-
-def send_requests_async(model_name, inputs, client, flags,
-                        request_parallelism):
-    if flags.protocol == "http":
-        async_requests = []
-        for _ in range(request_parallelism):
-            async_requests.append(client.async_infer(model_name, inputs))
-        return async_requests
-    else:
-        user_data = UserData()
-        for _ in range(request_parallelism):
-            client.async_infer(model_name, inputs,
-                               partial(completion_callback, user_data))
-        return user_data
-
-
-def get_http_results(async_requests):
-    results = []
-    for async_request in async_requests:
-        results.append(async_request.get_result())
-    return results
-
-
-def get_grpc_results(user_data, request_parallelism):
-    results = []
-    processed_count = 0
-    while processed_count < request_parallelism:
-        (result, error) = user_data._completed_requests.get()
-        processed_count += 1
-        if error is not None:
-            raise RuntimeError(error)
-        results.append(result)
-    return results
-
-
-def append_start_and_end_ids(inputs,
-                             batch_size,
-                             flags,
-                             start_id=None,
-                             end_id=None):
-    if start_id is not None:
-        start_ids = start_id * np.ones([batch_size, 1]).astype(np.uint32)
-        inputs.append(prepare_tensor("start_id", start_ids, flags.protocol))
-    if end_id is not None:
-        end_ids = end_id * np.ones([batch_size, 1]).astype(np.uint32)
-        inputs.append(prepare_tensor("end_id", end_ids, flags.protocol))
-
-
-def get_inflight_reqs_profile(start_times, end_times, requests_per_sec):
-    """
-    Receives start and end times of all requests,
-    divides total E2E time into equal intervals and assigns how many requests are in flight
-    in each interval.
-    """
-    # Calculate min of start time and max of end time
-    min_start_time = min(start_times)
-    max_end_time = max(end_times)
-
-    # need to have enough resolution intervals depending on avg. latency per request. 10 times smaller than request processing time
-    sec_per_request = 1.0 / requests_per_sec
-    NUM_INTERVALS = int((max_end_time - min_start_time) /
-                        timedelta(seconds=(sec_per_request / 10)))
-    print(NUM_INTERVALS)
-    # Calculate interval length
-    interval_length = (max_end_time - min_start_time) / NUM_INTERVALS
-
-    # Initialize a list to store the count of requests in each interval
-    interval_counts = [0] * NUM_INTERVALS
-
-    # Iterate through the requests and update interval counts
-    for i in range(len(start_times)):
-        start = start_times[i]
-        end = end_times[i]
-
-        # Calculate which interval the request falls into
-        interval_index = int((start - min_start_time) / interval_length)
-
-        # Increment the count for that interval and subsequent intervals until end
-        while start < end and interval_index < NUM_INTERVALS:
-            interval_counts[interval_index] += 1
-            interval_index += 1
-            start += interval_length
-
-    return interval_counts
-
-
-def extract_print_stats(ip_token_len_list, responses, user_data, FLAGS):
-
-    #### Gather info about requests
-    op_token_len_list = []
-    op_token_len_ooo = {}
-
-    for response in responses:
-        #JG: long sequence to extract output length from response json dict. Responses are out of order
-        op_token_len_ooo[response.get_response(as_json=True)['id']] = \
-            int(response.get_response(as_json=True)['outputs'][0]['shape'][2])
-
-    op_token_len_list = [
-        value for key, value in sorted(op_token_len_ooo.items())
-    ]
-
-    assert (len(op_token_len_list) == len(ip_token_len_list))
-    for i in range(len(op_token_len_list)):
-        op_token_len_list[i] = op_token_len_list[i] - ip_token_len_list[i]
-
-    # Get latencies per request
-    # Order latencies based on issue order.
-    latency_list_in_order = [
-        value for key, value in sorted(user_data._latency_dict.items())
-    ]
-    start_time_list_in_order = [
-        value for key, value in sorted(user_data._start_time_dict.items())
-    ]
-    stop_time_list_in_order = [
-        value for key, value in sorted(user_data._stop_time_dict.items())
-    ]
-
-    latency_sorted = np.sort(latency_list_in_order)
-    index_99 = math.ceil(len(latency_sorted) * 0.99)
-    index_90 = math.ceil(len(latency_sorted) * 0.90)
-
-    data = {
-        'latency': latency_list_in_order,
-        'start_time': start_time_list_in_order,
-        'stop_time': stop_time_list_in_order,
-        'num_ip_tokens': ip_token_len_list,
-        'num_op_tokens': op_token_len_list
-    }
-
-    # Bundle everything in a single DF
-    df = pd.DataFrame(data)
-
-    #stats
-    df['num_ip_tokens'].sum()
-    avg_ip_tokens = df['num_ip_tokens'].mean()
-    df['num_ip_tokens'].median()
-    df['num_ip_tokens'].std()
-    total_op_tokens = df['num_op_tokens'].sum()
-    avg_op_tokens = df['num_op_tokens'].mean()
-    df['num_op_tokens'].median()
-    df['num_op_tokens'].std()
-
-    tend = max(df['stop_time'].tolist())
-    t0 = min(df['start_time'].tolist())
-    total_latency = (tend - t0).total_seconds()
-    requests_per_sec = len(responses) / total_latency
-    tokens_generated_per_sec = total_op_tokens / total_latency
-
-    in_flight_requests_intervals = get_inflight_reqs_profile(
-        df['start_time'].tolist(), df['stop_time'].tolist(), requests_per_sec)
-    avg_in_flight_requests = np.mean(in_flight_requests_intervals)
-
-    print_data_dict = {}
-    print_data_dict["Requests/Sec"] = requests_per_sec
-    print_data_dict["OP tokens/sec"] = tokens_generated_per_sec
-    print_data_dict["Avg. latency (ms)"] = np.mean(latency_list_in_order)
-    print_data_dict["P99 latency (ms)"] = latency_sorted[index_99 - 1]
-    print_data_dict["P90 latency (ms)"] = latency_sorted[index_90 - 1]
-    print_data_dict["Avg. Input tokens per request"] = avg_ip_tokens
-    print_data_dict["Avg. Output tokens per request"] = avg_op_tokens
-    print_data_dict["Avg. InFlight requests"] = avg_in_flight_requests
-    print_data_dict["Total latency (ms)"] = total_latency * 1000
-    print_data_dict["Total requests"] = len(responses)
-
-    print_data = [["Requests/Sec", requests_per_sec],
-                  ["OP tokens/sec", tokens_generated_per_sec],
-                  ["Avg. latency (ms)",
-                   np.mean(latency_list_in_order)],
-                  ["P99 latency (ms)", latency_sorted[index_99 - 1]],
-                  ["P90 latency (ms)", latency_sorted[index_90 - 1]],
-                  ["Avg. IP tokens per request", avg_ip_tokens],
-                  ["Avg. OP tokens per request", avg_op_tokens],
-                  ["Avg. InFlight requests", avg_in_flight_requests],
-                  ["Total latency (ms)", total_latency * 1000],
-                  ["Total requests", len(responses)]]
-
-    # Format numerical values to 2 decimal places
-    formatted_data = [[item, f"{value:.2f}"] for item, value in print_data]
-    headers = ["Stat", "Value"]
-    table = tabulate(formatted_data, headers=headers, tablefmt="pretty")
-
-    if FLAGS.op_stats_csv is not None:
-        with open(".csv", "a", newline="") as file:
-            filednames = print_data_dict.keys()
-            writer = csv.DictWriter(file, fieldnames=filednames)
-
-            # Check if the file is empty, and write the header if needed
-            if file.tell() == 0:
-                writer.writeheader()
-
-            # Write the dictionaries as new rows
-            writer.writerow(print_data_dict)
-
-    print(table)
-
-    if FLAGS.dump_perfetto_trace:
-        json_dict = []
-        for i in range(len(op_token_len_list)):
-            req_dict = {}
-            req_dict['name'] = 'req_{}'.format(i)
-            req_dict["cat"] = "batch"
-            req_dict["ph"] = "X"
-            req_dict["ts"] = (start_time_list_in_order[i].timestamp() -
-                              t0.timestamp()) * 1000000  #perfetto expects us
-            req_dict["dur"] = (
-                stop_time_list_in_order[i] -
-                start_time_list_in_order[i]).total_seconds() * 1000000
-            req_dict["pid"] = "1"
-            req_dict["args"] = {
-                "isl": int(ip_token_len_list[i]),
-                "osl": int(op_token_len_list[i])
-            }
-            json_dict.append(req_dict)
-
-        with open("prfetto_dump.json", "w") as file:
-            json.dump(json_dict, file, indent=4)
-
-
-def extract_string_from_nested_list(nested_list):
-    if isinstance(nested_list, str):
-        return nested_list
-    elif isinstance(nested_list, list):
-        for item in nested_list:
-            extracted_string = extract_string_from_nested_list(item)
-            if extracted_string:
-                return extracted_string
-    return ""